Author Topic: How is a QB64 exe file made?  (Read 5414 times)

0 Members and 1 Guest are viewing this topic.

Offline _vince

  • Seasoned Forum Regular
  • Posts: 422
    • View Profile
Re: How is a QB64 exe file made?
« Reply #15 on: February 24, 2020, 08:18:05 pm »
I like the idea of a basic that outputs assembler. That way after having a working program I could rewrite sections one at a time for better performance.
FreeBASIC can output the full asm file right before it gets assembled, but it also has inline assembly support, so you'd never need to modify the generated file anyway (not that you should ever try to hand-optimize modern asm).

Here's an excerpt:
Code: [Select]
#lang "fblite"
const sw = 800
const sh = 600
dim shared as double pi = 2*asin(1)  ' asin(1) is pi/2
' open an sw x sh graphics window at 32 bits per pixel
screenres sw, sh, 32
' i is used without DIM, which is why the fblite dialect is selected on line 1
i=0
pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))  ' put the drawing cursor on the first vertex (i = 0)
for i=0 to 5
line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))  ' draw from the cursor to every second vertex of a pentagon
next
' wait for a key press, then end the program
sleep
system

excerpt from the .asm file
Code: [Select]
# main for star.bas: compiler-generated x86-64 code in GAS Intel syntax
# (operands read "dst, src").
# Stack slots used below:
#   -8[rbp]   copy of the stack-protector canary read from fs:40
#   -16[rbp]  64-bit loop counter
#   -20[rbp]  32-bit value returned in eax at the end
#   -36[rbp]  incoming edi, and -48[rbp] incoming rsi (first two integer args)
#   -40[rbp]  float scratch slot that keeps y alive across the call to cos
# PI$ and the constants .LC0-.LC4 are defined outside this excerpt.
# NOTE(review): to match the BASIC expressions they are presumably
# .LC0 = 5.0, .LC1 = 200.0, .LC2 = 300.0, .LC3 = 400.0, .LC4 = 4.0 - confirm
# against the constant pool of the full .asm file.
main:
.LFB0:
.file 1 "star.bas"
.loc 1 1 1
.cfi_startproc
# Frame-pointer prologue with 48 bytes of locals.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
sub rsp, 48
# Spill the two incoming integer arguments so they can be handed to fb_Init.
mov DWORD PTR -36[rbp], edi
mov QWORD PTR -48[rbp], rsi
# Save the canary; it is compared again just before the function returns.
.loc 1 1 1
mov rax, QWORD PTR fs:40
mov QWORD PTR -8[rbp], rax
xor eax, eax
# Zero the return value and the loop counter.
.loc 1 1 2
mov DWORD PTR -20[rbp], 0
.loc 1 1 2
mov QWORD PTR -16[rbp], 0
# fb_Init(edi = saved edi, rsi = saved rsi, edx = 2)
.loc 1 1 2
mov rcx, QWORD PTR -48[rbp]
mov eax, DWORD PTR -36[rbp]
mov edx, 2
mov rsi, rcx
mov edi, eax
call fb_Init@PLT
.L2:
# fb_GfxScreenRes(800, 600, 32, 1, 0, 0); arguments are loaded last-to-first.
.loc 1 6 2
mov r9d, 0
mov r8d, 0
mov ecx, 1
mov edx, 32
mov esi, 600
mov edi, 800
call fb_GfxScreenRes@PLT
# counter = 0 (source line 8)
.loc 1 8 6
mov QWORD PTR -16[rbp], 0
# Source line 9, y coordinate:
#   y = .LC2 - .LC1 * sin((PI$ * counter) * 2 / .LC0)
# The doubling is done with addsd xmm0, xmm0.
.loc 1 9 157
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 9 155
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm0, xmm1
.loc 1 9 170
addsd xmm0, xmm0
.loc 1 9 133
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call sin@PLT
movapd xmm1, xmm0
.loc 1 9 195
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 9 207
movsd xmm0, QWORD PTR .LC2[rip]
subsd xmm0, xmm1
# Narrow y to float and park it on the stack: xmm registers are caller-saved
# and cos is called next.
.loc 1 9 2
cvtsd2ss xmm4, xmm0
movss DWORD PTR -40[rbp], xmm4
# Source line 9, x coordinate:
#   x = .LC3 + .LC1 * cos((PI$ * counter) * 2 / .LC0)
.loc 1 9 60
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 9 58
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm0, xmm1
.loc 1 9 73
addsd xmm0, xmm0
.loc 1 9 36
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call cos@PLT
movapd xmm1, xmm0
.loc 1 9 98
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 9 110
movsd xmm0, QWORD PTR .LC3[rip]
addsd xmm0, xmm1
# fb_GfxPset(edi = 0, xmm0 = x, xmm1 = y, esi = 0, edx = -2147483644, ecx = 0)
.loc 1 9 2
cvtsd2ss xmm0, xmm0
mov ecx, 0
mov edx, -2147483644
mov esi, 0
movss xmm1, DWORD PTR -40[rbp]
mov edi, 0
call fb_GfxPset@PLT
# FOR loop setup: counter = 0, then fall straight into the body.
.loc 1 10 7
mov QWORD PTR -16[rbp], 0
.L3:
# Loop body (source line 11), y coordinate:
#   y = .LC2 - .LC1 * sin(.LC4 * (counter * PI$) / .LC0)
.loc 1 11 177
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 11 175
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm1, xmm0
.loc 1 11 190
movsd xmm0, QWORD PTR .LC4[rip]
mulsd xmm0, xmm1
.loc 1 11 153
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call sin@PLT
movapd xmm1, xmm0
.loc 1 11 215
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 11 227
movsd xmm0, QWORD PTR .LC2[rip]
subsd xmm0, xmm1
# Spill y as a float across the upcoming call to cos.
.loc 1 11 4
cvtsd2ss xmm5, xmm0
movss DWORD PTR -40[rbp], xmm5
# Loop body, x coordinate:
#   x = .LC3 + .LC1 * cos(.LC4 * (counter * PI$) / .LC0)
.loc 1 11 80
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 11 78
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm1, xmm0
.loc 1 11 93
movsd xmm0, QWORD PTR .LC4[rip]
mulsd xmm0, xmm1
.loc 1 11 56
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call cos@PLT
movapd xmm1, xmm0
.loc 1 11 118
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 11 130
movsd xmm0, QWORD PTR .LC3[rip]
addsd xmm0, xmm1
# fb_GfxLine(edi = 0, xmm0 = 0, xmm1 = 0, xmm2 = x, xmm3 = y,
#            esi = 0, edx = 0, ecx = 65535, r8d = -2147483646)
# x is moved to xmm2 before xmm0 and xmm1 are cleared with pxor.
.loc 1 11 4
cvtsd2ss xmm0, xmm0
mov r8d, -2147483646
mov ecx, 65535
mov edx, 0
mov esi, 0
movss xmm3, DWORD PTR -40[rbp]
movaps xmm2, xmm0
pxor xmm1, xmm1
pxor xmm0, xmm0
mov edi, 0
call fb_GfxLine@PLT
.L4:
# NEXT: counter = counter + 1
.loc 1 12 13
mov rax, QWORD PTR -16[rbp]
add rax, 1
.loc 1 12 7
mov QWORD PTR -16[rbp], rax
.L5:
# Bottom-of-loop test (signed): leave when counter > 5, otherwise repeat the body.
.loc 1 12 11
mov rax, QWORD PTR -16[rbp]
.loc 1 12 5
cmp rax, 5
jg .L10
.loc 1 12 20 discriminator 2
jmp .L3
.L10:
.loc 1 12 3
nop
.L6:
# fb_Sleep(-1), then fb_End(0)
.loc 1 14 2
mov edi, -1
call fb_Sleep@PLT
.loc 1 15 2
mov edi, 0
call fb_End@PLT
.L7:
# A second fb_End(0) is emitted for the implicit end of the program.
# NOTE(review): presumably never reached if fb_End does not return - confirm.
.loc 1 15 2
mov edi, 0
call fb_End@PLT
# Load the return value, then verify the canary before tearing down the frame.
.loc 1 15 9
mov eax, DWORD PTR -20[rbp]
.loc 1 15 1
mov rdx, QWORD PTR -8[rbp]
xor rdx, QWORD PTR fs:40
je .L9
call __stack_chk_fail@PLT
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc

It can also output a C source file. It is a sort of 'low-level C' that allows FreeBASIC to be multiplatform. I'm not an expert on FreeBASIC internals, but it is quite interesting.
Code: [Select]
// Fixed-width integer aliases used throughout the generated file.
typedef   signed char       int8;
typedef unsigned char      uint8;
typedef   signed short      int16;
typedef unsigned short     uint16;
typedef   signed int        int32;
typedef unsigned int       uint32;
typedef   signed long long  int64;
typedef unsigned long long uint64;
// String descriptor: a data pointer plus 64-bit len and size fields.
// It is not referenced by the code in this listing.
typedef struct { char *data; int64 len; int64 size; } FBSTRING;
typedef int8 boolean;
// Prototypes for the runtime and graphics routines that main calls.
// Each one is tagged with source line 15, and fb_End is declared twice;
// repeated compatible declarations are legal C.
#line 15 "star.bas"
void fb_GfxPset( void*, float, float, uint32, int32, int32 );
#line 15 "star.bas"
void fb_GfxLine( void*, float, float, float, float, uint32, int32, uint32, int32 );
#line 15 "star.bas"
int32 fb_GfxScreenRes( int32, int32, int32, int32, int32, int32 );
#line 15 "star.bas"
void fb_Init( int32, uint8**, int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_Sleep( int32 );
// The BASIC variable pi, already folded to a hexadecimal floating-point
// literal (about 3.14159265358979). The '$' in the identifier relies on a
// compiler extension; GCC accepts it.
#line 15 "star.bas"
static double PI$ = 0x1.921FB54442D18p+1;

// Entry point generated from star.bas. The original BASIC statements appear
// as comments, and the #line directives map each C statement back to its
// BASIC source line.
// Hex float constants used below: 0x1.p+1 = 2.0, 0x1.p+2 = 4.0,
// 0x1.4p+2 = 5.0, 0x1.9p+7 = 200.0, 0x1.9p+8 = 400.0, 0x1.2Cp+8 = 300.0.
#line 1 "star.bas"
int32 main( int32 __FB_ARGC__$0, char** __FB_ARGV__$0 )
#line 1 "star.bas"
{
// Return value of main: zero-filled here and never assigned again.
#line 1 "star.bas"
int32 fb$result$0;
#line 1 "star.bas"
__builtin_memset( &fb$result$0, 0, 4ll );
// The BASIC variable i, as a zero-filled 64-bit integer.
#line 1 "star.bas"
int64 I$0;
#line 1 "star.bas"
__builtin_memset( &I$0, 0, 8ll );
// Runtime start-up; the meaning of the third argument (2) is not shown in
// this listing.
#line 1 "star.bas"
fb_Init( __FB_ARGC__$0, (uint8**)__FB_ARGV__$0, 2 );
#line 1 "star.bas"
label$0:;
// #lang "fblite"
// const sw = 800
// const sh = 600
// dim shared as double pi = 2*asin(1)
// screenres sw, sh, 32
// sw and sh have been replaced by their constant values 800 and 600.
#line 6 "star.bas"
fb_GfxScreenRes( 800, 600, 32, 1, 0, 0 );
// i=0
#line 8 "star.bas"
I$0 = 0ll;
// x = cos(pi*i*2/5)*200 + 400 and y = -(sin(pi*i*2/5)*200) + 300;
// sw/2 and sh/2 were folded to 400 and 300.
// pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))
#line 9 "star.bas"
fb_GfxPset( (void*)0ull, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, -2147483644, 0 );
// The FOR loop is lowered to labels and a goto. The test sits at the bottom,
// so the body runs for i = 0, 1, ..., 5.
// for i=0 to 5
{
#line 10 "star.bas"
I$0 = 0ll;
#line 10 "star.bas"
label$5:;
{
// Same formula as the pset call, with the two factors of 2 folded into 4.0.
// line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))
#line 11 "star.bas"
fb_GfxLine( (void*)0ull, 0x0p+0f, 0x0p+0f, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, 0, 65535u, -2147483646 );
// next
}
// Increment, then loop back while i <= 5.
#line 12 "star.bas"
label$3:;
#line 12 "star.bas"
I$0 = I$0 + 1ll;
#line 12 "star.bas"
label$2:;
#line 12 "star.bas"
if( I$0 <= 5ll ) goto label$5;
#line 12 "star.bas"
label$4:;
}
// sleep
#line 14 "star.bas"
fb_Sleep( -1 );
// system
#line 15 "star.bas"
fb_End( 0 );
// A second fb_End(0) is emitted for the implicit end of the program.
// NOTE(review): presumably never reached if fb_End does not return - confirm.
#line 15 "star.bas"
label$1:;
#line 15 "star.bas"
fb_End( 0 );
#line 15 "star.bas"
return fb$result$0;
#line 15 "star.bas"
}
« Last Edit: February 24, 2020, 08:19:38 pm by _vince »

Offline romichess

  • Forum Regular
  • Posts: 145
    • View Profile
Re: How is a QB64 exe file made?
« Reply #16 on: February 24, 2020, 09:26:11 pm »
FreeBASIC can output the full asm file right before it gets assembled, but it also has inline assembly support, so you'd never need to modify the generated file anyway (not that you should ever try to hand-optimize modern asm).

Here's an excerpt:
Code: [Select]
#lang "fblite"
const sw = 800
const sh = 600
dim shared as double pi = 2*asin(1)  ' asin(1) is pi/2
' open an sw x sh graphics window at 32 bits per pixel
screenres sw, sh, 32
' i is used without DIM, which is why the fblite dialect is selected on line 1
i=0
pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))  ' put the drawing cursor on the first vertex (i = 0)
for i=0 to 5
line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))  ' draw from the cursor to every second vertex of a pentagon
next
' wait for a key press, then end the program
sleep
system

excerpt from the .asm file
Code: [Select]
# main for star.bas: compiler-generated x86-64 code in GAS Intel syntax
# (operands read "dst, src").
# Stack slots used below:
#   -8[rbp]   copy of the stack-protector canary read from fs:40
#   -16[rbp]  64-bit loop counter
#   -20[rbp]  32-bit value returned in eax at the end
#   -36[rbp]  incoming edi, and -48[rbp] incoming rsi (first two integer args)
#   -40[rbp]  float scratch slot that keeps y alive across the call to cos
# PI$ and the constants .LC0-.LC4 are defined outside this excerpt.
# NOTE(review): to match the BASIC expressions they are presumably
# .LC0 = 5.0, .LC1 = 200.0, .LC2 = 300.0, .LC3 = 400.0, .LC4 = 4.0 - confirm
# against the constant pool of the full .asm file.
main:
.LFB0:
.file 1 "star.bas"
.loc 1 1 1
.cfi_startproc
# Frame-pointer prologue with 48 bytes of locals.
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
sub rsp, 48
# Spill the two incoming integer arguments so they can be handed to fb_Init.
mov DWORD PTR -36[rbp], edi
mov QWORD PTR -48[rbp], rsi
# Save the canary; it is compared again just before the function returns.
.loc 1 1 1
mov rax, QWORD PTR fs:40
mov QWORD PTR -8[rbp], rax
xor eax, eax
# Zero the return value and the loop counter.
.loc 1 1 2
mov DWORD PTR -20[rbp], 0
.loc 1 1 2
mov QWORD PTR -16[rbp], 0
# fb_Init(edi = saved edi, rsi = saved rsi, edx = 2)
.loc 1 1 2
mov rcx, QWORD PTR -48[rbp]
mov eax, DWORD PTR -36[rbp]
mov edx, 2
mov rsi, rcx
mov edi, eax
call fb_Init@PLT
.L2:
# fb_GfxScreenRes(800, 600, 32, 1, 0, 0); arguments are loaded last-to-first.
.loc 1 6 2
mov r9d, 0
mov r8d, 0
mov ecx, 1
mov edx, 32
mov esi, 600
mov edi, 800
call fb_GfxScreenRes@PLT
# counter = 0 (source line 8)
.loc 1 8 6
mov QWORD PTR -16[rbp], 0
# Source line 9, y coordinate:
#   y = .LC2 - .LC1 * sin((PI$ * counter) * 2 / .LC0)
# The doubling is done with addsd xmm0, xmm0.
.loc 1 9 157
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 9 155
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm0, xmm1
.loc 1 9 170
addsd xmm0, xmm0
.loc 1 9 133
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call sin@PLT
movapd xmm1, xmm0
.loc 1 9 195
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 9 207
movsd xmm0, QWORD PTR .LC2[rip]
subsd xmm0, xmm1
# Narrow y to float and park it on the stack: xmm registers are caller-saved
# and cos is called next.
.loc 1 9 2
cvtsd2ss xmm4, xmm0
movss DWORD PTR -40[rbp], xmm4
# Source line 9, x coordinate:
#   x = .LC3 + .LC1 * cos((PI$ * counter) * 2 / .LC0)
.loc 1 9 60
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 9 58
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm0, xmm1
.loc 1 9 73
addsd xmm0, xmm0
.loc 1 9 36
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call cos@PLT
movapd xmm1, xmm0
.loc 1 9 98
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 9 110
movsd xmm0, QWORD PTR .LC3[rip]
addsd xmm0, xmm1
# fb_GfxPset(edi = 0, xmm0 = x, xmm1 = y, esi = 0, edx = -2147483644, ecx = 0)
.loc 1 9 2
cvtsd2ss xmm0, xmm0
mov ecx, 0
mov edx, -2147483644
mov esi, 0
movss xmm1, DWORD PTR -40[rbp]
mov edi, 0
call fb_GfxPset@PLT
# FOR loop setup: counter = 0, then fall straight into the body.
.loc 1 10 7
mov QWORD PTR -16[rbp], 0
.L3:
# Loop body (source line 11), y coordinate:
#   y = .LC2 - .LC1 * sin(.LC4 * (counter * PI$) / .LC0)
.loc 1 11 177
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 11 175
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm1, xmm0
.loc 1 11 190
movsd xmm0, QWORD PTR .LC4[rip]
mulsd xmm0, xmm1
.loc 1 11 153
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call sin@PLT
movapd xmm1, xmm0
.loc 1 11 215
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 11 227
movsd xmm0, QWORD PTR .LC2[rip]
subsd xmm0, xmm1
# Spill y as a float across the upcoming call to cos.
.loc 1 11 4
cvtsd2ss xmm5, xmm0
movss DWORD PTR -40[rbp], xmm5
# Loop body, x coordinate:
#   x = .LC3 + .LC1 * cos(.LC4 * (counter * PI$) / .LC0)
.loc 1 11 80
mov rax, QWORD PTR -16[rbp]
cvtsi2sd xmm1, rax
.loc 1 11 78
movsd xmm0, QWORD PTR PI$[rip]
mulsd xmm1, xmm0
.loc 1 11 93
movsd xmm0, QWORD PTR .LC4[rip]
mulsd xmm0, xmm1
.loc 1 11 56
movsd xmm1, QWORD PTR .LC0[rip]
divsd xmm0, xmm1
call cos@PLT
movapd xmm1, xmm0
.loc 1 11 118
movsd xmm0, QWORD PTR .LC1[rip]
mulsd xmm1, xmm0
.loc 1 11 130
movsd xmm0, QWORD PTR .LC3[rip]
addsd xmm0, xmm1
# fb_GfxLine(edi = 0, xmm0 = 0, xmm1 = 0, xmm2 = x, xmm3 = y,
#            esi = 0, edx = 0, ecx = 65535, r8d = -2147483646)
# x is moved to xmm2 before xmm0 and xmm1 are cleared with pxor.
.loc 1 11 4
cvtsd2ss xmm0, xmm0
mov r8d, -2147483646
mov ecx, 65535
mov edx, 0
mov esi, 0
movss xmm3, DWORD PTR -40[rbp]
movaps xmm2, xmm0
pxor xmm1, xmm1
pxor xmm0, xmm0
mov edi, 0
call fb_GfxLine@PLT
.L4:
# NEXT: counter = counter + 1
.loc 1 12 13
mov rax, QWORD PTR -16[rbp]
add rax, 1
.loc 1 12 7
mov QWORD PTR -16[rbp], rax
.L5:
# Bottom-of-loop test (signed): leave when counter > 5, otherwise repeat the body.
.loc 1 12 11
mov rax, QWORD PTR -16[rbp]
.loc 1 12 5
cmp rax, 5
jg .L10
.loc 1 12 20 discriminator 2
jmp .L3
.L10:
.loc 1 12 3
nop
.L6:
# fb_Sleep(-1), then fb_End(0)
.loc 1 14 2
mov edi, -1
call fb_Sleep@PLT
.loc 1 15 2
mov edi, 0
call fb_End@PLT
.L7:
# A second fb_End(0) is emitted for the implicit end of the program.
# NOTE(review): presumably never reached if fb_End does not return - confirm.
.loc 1 15 2
mov edi, 0
call fb_End@PLT
# Load the return value, then verify the canary before tearing down the frame.
.loc 1 15 9
mov eax, DWORD PTR -20[rbp]
.loc 1 15 1
mov rdx, QWORD PTR -8[rbp]
xor rdx, QWORD PTR fs:40
je .L9
call __stack_chk_fail@PLT
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc

It can also output a C source file. It is a sort of 'low-level C' that allows FreeBASIC to be multiplatform. I'm not an expert on FreeBASIC internals, but it is quite interesting.
Code: [Select]
// Fixed-width integer aliases used throughout the generated file.
typedef   signed char       int8;
typedef unsigned char      uint8;
typedef   signed short      int16;
typedef unsigned short     uint16;
typedef   signed int        int32;
typedef unsigned int       uint32;
typedef   signed long long  int64;
typedef unsigned long long uint64;
// String descriptor: a data pointer plus 64-bit len and size fields.
// It is not referenced by the code in this listing.
typedef struct { char *data; int64 len; int64 size; } FBSTRING;
typedef int8 boolean;
// Prototypes for the runtime and graphics routines that main calls.
// Each one is tagged with source line 15, and fb_End is declared twice;
// repeated compatible declarations are legal C.
#line 15 "star.bas"
void fb_GfxPset( void*, float, float, uint32, int32, int32 );
#line 15 "star.bas"
void fb_GfxLine( void*, float, float, float, float, uint32, int32, uint32, int32 );
#line 15 "star.bas"
int32 fb_GfxScreenRes( int32, int32, int32, int32, int32, int32 );
#line 15 "star.bas"
void fb_Init( int32, uint8**, int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_Sleep( int32 );
// The BASIC variable pi, already folded to a hexadecimal floating-point
// literal (about 3.14159265358979). The '$' in the identifier relies on a
// compiler extension; GCC accepts it.
#line 15 "star.bas"
static double PI$ = 0x1.921FB54442D18p+1;

// Entry point generated from star.bas. The original BASIC statements appear
// as comments, and the #line directives map each C statement back to its
// BASIC source line.
// Hex float constants used below: 0x1.p+1 = 2.0, 0x1.p+2 = 4.0,
// 0x1.4p+2 = 5.0, 0x1.9p+7 = 200.0, 0x1.9p+8 = 400.0, 0x1.2Cp+8 = 300.0.
#line 1 "star.bas"
int32 main( int32 __FB_ARGC__$0, char** __FB_ARGV__$0 )
#line 1 "star.bas"
{
// Return value of main: zero-filled here and never assigned again.
#line 1 "star.bas"
int32 fb$result$0;
#line 1 "star.bas"
__builtin_memset( &fb$result$0, 0, 4ll );
// The BASIC variable i, as a zero-filled 64-bit integer.
#line 1 "star.bas"
int64 I$0;
#line 1 "star.bas"
__builtin_memset( &I$0, 0, 8ll );
// Runtime start-up; the meaning of the third argument (2) is not shown in
// this listing.
#line 1 "star.bas"
fb_Init( __FB_ARGC__$0, (uint8**)__FB_ARGV__$0, 2 );
#line 1 "star.bas"
label$0:;
// #lang "fblite"
// const sw = 800
// const sh = 600
// dim shared as double pi = 2*asin(1)
// screenres sw, sh, 32
// sw and sh have been replaced by their constant values 800 and 600.
#line 6 "star.bas"
fb_GfxScreenRes( 800, 600, 32, 1, 0, 0 );
// i=0
#line 8 "star.bas"
I$0 = 0ll;
// x = cos(pi*i*2/5)*200 + 400 and y = -(sin(pi*i*2/5)*200) + 300;
// sw/2 and sh/2 were folded to 400 and 300.
// pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))
#line 9 "star.bas"
fb_GfxPset( (void*)0ull, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, -2147483644, 0 );
// The FOR loop is lowered to labels and a goto. The test sits at the bottom,
// so the body runs for i = 0, 1, ..., 5.
// for i=0 to 5
{
#line 10 "star.bas"
I$0 = 0ll;
#line 10 "star.bas"
label$5:;
{
// Same formula as the pset call, with the two factors of 2 folded into 4.0.
// line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))
#line 11 "star.bas"
fb_GfxLine( (void*)0ull, 0x0p+0f, 0x0p+0f, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, 0, 65535u, -2147483646 );
// next
}
// Increment, then loop back while i <= 5.
#line 12 "star.bas"
label$3:;
#line 12 "star.bas"
I$0 = I$0 + 1ll;
#line 12 "star.bas"
label$2:;
#line 12 "star.bas"
if( I$0 <= 5ll ) goto label$5;
#line 12 "star.bas"
label$4:;
}
// sleep
#line 14 "star.bas"
fb_Sleep( -1 );
// system
#line 15 "star.bas"
fb_End( 0 );
// A second fb_End(0) is emitted for the implicit end of the program.
// NOTE(review): presumably never reached if fb_End does not return - confirm.
#line 15 "star.bas"
label$1:;
#line 15 "star.bas"
fb_End( 0 );
#line 15 "star.bas"
return fb$result$0;
#line 15 "star.bas"
}

For 99.9% of the use cases out there you are 100% correct. There is no longer a need for handwritten assembler. However, there are special cases where handwritten assembler is a must. I gave one example in my code example above. In chess programming, for the last 12 years or so, magic bitboards have been untouchable for speed in move generation. It is just 7 machine language instructions for a rook or a bishop and 14 (R + B) for the queen. That would be unbeatable forever if it were not for two (three for Intel) facts. First, the 7 instructions form a dependency chain, which makes the code unfriendly to dual execution pipes. Second, it has one imul instruction, which is slightly more expensive even on today's processors. The third, for Intel, is that Intel CPUs have slow shift instructions, and magic uses several shifts. Now, my handwritten assembler for my new bitboard approach has 20 instructions, but they are split between two dependency chains that have zero cross-dependencies and thus will run in both pipes as though they were only 10 instructions running in one pipe. Overall they are also faster instructions, with no imul and only one shift. And that is why, in special cases, handwritten assembler can still be superior when it counts! :)
My name is Michael, but you can call me Mike :)