Author Topic: How is a QB64 exe file made? (Read 12056 times)

_vince · « **Reply #15 on:** February 24, 2020, 08:18:05 pm »

Quote from: romichess on February 24, 2020, 03:58:45 pm

I like the idea of a basic that outputs assembler. That way after having a working program I could rewrite sections one at a time for better performance.

FreeBASIC can output the full asm file right before it gets assembled, but it also has inline assembly support, so you'd never need to modify it anyway (not that you should ever try to hand-optimize modern asm).

Here's an excerpt:

Code: [Select]

#lang "fblite"
const sw = 800                          ' window width in pixels
const sh = 600                          ' window height in pixels
dim shared as double pi = 2*asin(1)     ' asin(1) is pi/2
' Open an sw x sh graphics window at 32 bits per pixel.
screenres sw, sh, 32
' i is an implicit variable (allowed by the fblite dialect selected above).
i=0
pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))   ' pen starts at vertex 0, radius 200 around the centre
for i=0 to 5                            ' angle advances two fifths of a turn per step, so every other vertex is joined
	line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))   ' draw from the current pen position
next
' Pause (compiled to fb_Sleep(-1)), then end the program.
sleep
system

excerpt from the .asm file

Code: [Select]

# main -- entry point generated for star.bas (GAS directives, Intel operand order).
# In:    edi and rsi as received from the caller; both are passed on to fb_Init.
# Out:   eax = dword at -20[rbp] (zeroed below and not written again in this excerpt).
# Frame: guard copy at -8[rbp], 64-bit counter i at -16[rbp], 32-bit result at
#        -20[rbp], saved edi at -36[rbp], float scratch for y at -40[rbp],
#        saved rsi at -48[rbp].
# NOTE(review): .LC0-.LC4 are defined outside this excerpt. Going by the BASIC
# expressions they presumably hold 5.0, 200.0, 300.0, 400.0 and 4.0 -- confirm.
main:
.LFB0:
	.file 1 "star.bas"
	.loc 1 1 1
	.cfi_startproc
	# Frame setup: save rbp and reserve 48 bytes of locals.
	push	rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	mov	rbp, rsp
	.cfi_def_cfa_register 6
	sub	rsp, 48
	# Spill the two incoming register arguments so they can be reloaded for fb_Init.
	mov	DWORD PTR -36[rbp], edi
	mov	QWORD PTR -48[rbp], rsi
	.loc 1 1 1
	# Copy the guard value at fs:40 into the frame; it is re-checked before ret.
	mov	rax, QWORD PTR fs:40
	mov	QWORD PTR -8[rbp], rax
	xor	eax, eax
	.loc 1 1 2
	# result = 0
	mov	DWORD PTR -20[rbp], 0
	.loc 1 1 2
	# i = 0
	mov	QWORD PTR -16[rbp], 0
	.loc 1 1 2
	# fb_Init(saved edi, saved rsi, 2)
	mov	rcx, QWORD PTR -48[rbp]
	mov	eax, DWORD PTR -36[rbp]
	mov	edx, 2
	mov	rsi, rcx
	mov	edi, eax
	call	fb_Init@PLT
.L2:
	.loc 1 6 2
	# fb_GfxScreenRes(800, 600, 32, 1, 0, 0): integer arguments are loaded into
	# edi, esi, edx, ecx, r8d, r9d in that order.
	mov	r9d, 0
	mov	r8d, 0
	mov	ecx, 1
	mov	edx, 32
	mov	esi, 600
	mov	edi, 800
	call	fb_GfxScreenRes@PLT
	.loc 1 8 6
	# i = 0 (the explicit assignment before pset)
	mov	QWORD PTR -16[rbp], 0
	.loc 1 9 157
	# --- pset, y coordinate: .LC2 - .LC1 * sin(2 * PI * i / .LC0) ---
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 9 155
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm0, xmm1
	.loc 1 9 170
	# x + x doubles the product, i.e. the "* 2" of the source expression.
	addsd	xmm0, xmm0
	.loc 1 9 133
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	sin@PLT
	movapd	xmm1, xmm0
	.loc 1 9 195
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 9 207
	movsd	xmm0, QWORD PTR .LC2[rip]
	subsd	xmm0, xmm1
	.loc 1 9 2
	# Narrow y to float and park it in the frame so it survives the cos call.
	cvtsd2ss	xmm4, xmm0
	movss	DWORD PTR -40[rbp], xmm4
	.loc 1 9 60
	# --- pset, x coordinate: .LC3 + .LC1 * cos(2 * PI * i / .LC0) ---
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 9 58
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm0, xmm1
	.loc 1 9 73
	addsd	xmm0, xmm0
	.loc 1 9 36
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	cos@PLT
	movapd	xmm1, xmm0
	.loc 1 9 98
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 9 110
	movsd	xmm0, QWORD PTR .LC3[rip]
	addsd	xmm0, xmm1
	.loc 1 9 2
	# fb_GfxPset: rdi = 0, xmm0 = x, xmm1 = y (reloaded), esi = 0,
	# edx = -2147483644, ecx = 0.
	cvtsd2ss	xmm0, xmm0
	mov	ecx, 0
	mov	edx, -2147483644
	mov	esi, 0
	movss	xmm1, DWORD PTR -40[rbp]
	mov	edi, 0
	call	fb_GfxPset@PLT
	.loc 1 10 7
	# for-loop initialisation: i = 0
	mov	QWORD PTR -16[rbp], 0
.L3:
	# Loop body. Same shape as the pset computation, except that the angle is
	# PI * i * .LC4 / .LC0 (an explicit multiply instead of the addsd doubling).
	.loc 1 11 177
	# --- line, y coordinate ---
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 11 175
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 190
	movsd	xmm0, QWORD PTR .LC4[rip]
	mulsd	xmm0, xmm1
	.loc 1 11 153
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	sin@PLT
	movapd	xmm1, xmm0
	.loc 1 11 215
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 227
	movsd	xmm0, QWORD PTR .LC2[rip]
	subsd	xmm0, xmm1
	.loc 1 11 4
	# Narrow y to float and keep it in the frame across the cos call.
	cvtsd2ss	xmm5, xmm0
	movss	DWORD PTR -40[rbp], xmm5
	.loc 1 11 80
	# --- line, x coordinate ---
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 11 78
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 93
	movsd	xmm0, QWORD PTR .LC4[rip]
	mulsd	xmm0, xmm1
	.loc 1 11 56
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	cos@PLT
	movapd	xmm1, xmm0
	.loc 1 11 118
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 130
	movsd	xmm0, QWORD PTR .LC3[rip]
	addsd	xmm0, xmm1
	.loc 1 11 4
	# fb_GfxLine: rdi = 0, xmm0 = xmm1 = 0.0 (pxor), xmm2 = x, xmm3 = y,
	# esi = 0, edx = 0, ecx = 65535, r8d = -2147483646.
	cvtsd2ss	xmm0, xmm0
	mov	r8d, -2147483646
	mov	ecx, 65535
	mov	edx, 0
	mov	esi, 0
	movss	xmm3, DWORD PTR -40[rbp]
	movaps	xmm2, xmm0
	pxor	xmm1, xmm1
	pxor	xmm0, xmm0
	mov	edi, 0
	call	fb_GfxLine@PLT
.L4:
	.loc 1 12 13
	# next: i = i + 1
	mov	rax, QWORD PTR -16[rbp]
	add	rax, 1
	.loc 1 12 7
	mov	QWORD PTR -16[rbp], rax
.L5:
	.loc 1 12 11
	# Signed test: leave the loop once i > 5, otherwise run the body again.
	mov	rax, QWORD PTR -16[rbp]
	.loc 1 12 5
	cmp	rax, 5
	jg	.L10
	.loc 1 12 20 discriminator 2
	jmp	.L3
.L10:
	.loc 1 12 3
	nop
.L6:
	.loc 1 14 2
	# sleep -> fb_Sleep(-1)
	mov	edi, -1
	call	fb_Sleep@PLT
	.loc 1 15 2
	# system -> fb_End(0)
	mov	edi, 0
	call	fb_End@PLT
.L7:
	.loc 1 15 2
	# A second fb_End(0) follows the one emitted for "system".
	mov	edi, 0
	call	fb_End@PLT
	.loc 1 15 9
	# Return value = result slot.
	mov	eax, DWORD PTR -20[rbp]
	.loc 1 15 1
	# Compare the saved guard with fs:40; any difference calls __stack_chk_fail.
	mov	rdx, QWORD PTR -8[rbp]
	xor	rdx, QWORD PTR fs:40
	je	.L9
	call	__stack_chk_fail@PLT
.L9:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc

It can also output a C source file. It is a sort of 'low-level C' that allows FreeBASIC to be multiplatform. I'm not an expert on FreeBASIC internals, but it's quite interesting.

Code: [Select]

// Fixed-width integer aliases used by the rest of the generated code.
typedef   signed char       int8;
typedef unsigned char      uint8;
typedef   signed short      int16;
typedef unsigned short     uint16;
typedef   signed int        int32;
typedef unsigned int       uint32;
typedef   signed long long  int64;
typedef unsigned long long uint64;
// String descriptor: a data pointer plus two 64-bit fields named len and size.
typedef struct { char *data; int64 len; int64 size; } FBSTRING;
typedef int8 boolean;
// Prototypes for the fb_* functions that main calls. fb_End is declared
// twice; a repeated identical prototype is harmless in C.
#line 15 "star.bas"
void fb_GfxPset( void*, float, float, uint32, int32, int32 );
#line 15 "star.bas"
void fb_GfxLine( void*, float, float, float, float, uint32, int32, uint32, int32 );
#line 15 "star.bas"
int32 fb_GfxScreenRes( int32, int32, int32, int32, int32, int32 );
#line 15 "star.bas"
void fb_Init( int32, uint8**, int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_Sleep( int32 );
// The BASIC initialiser 2*asin(1) appears here already folded to a constant:
// the hexadecimal floating literal 0x1.921FB54442D18p+1 is pi as a double.
#line 15 "star.bas"
static double PI$ = 0x1.921FB54442D18p+1;

// Program entry point generated from star.bas. Each statement is preceded by
// a #line directive that maps it back to a line of the BASIC source.
// Returns fb$result$0, which is zero-filled below and never reassigned.
#line 1 "star.bas"
int32 main( int32 __FB_ARGC__$0, char** __FB_ARGV__$0 )
#line 1 "star.bas"
{
	// Return value, zero-filled.
	#line 1 "star.bas"
	int32 fb$result$0;
	#line 1 "star.bas"
	__builtin_memset( &fb$result$0, 0, 4ll );
	// Storage for the BASIC variable i (64-bit), zero-filled.
	#line 1 "star.bas"
	int64 I$0;
	#line 1 "star.bas"
	__builtin_memset( &I$0, 0, 8ll );
	// Pass argc/argv to fb_Init; the meaning of the third argument (2) is not
	// visible in this listing.
	#line 1 "star.bas"
	fb_Init( __FB_ARGC__$0, (uint8**)__FB_ARGV__$0, 2 );
	#line 1 "star.bas"
	label$0:;
	// #lang "fblite"
	// const sw = 800
	// const sh = 600
	// dim shared as double pi = 2*asin(1)
	// screenres sw, sh, 32
	#line 6 "star.bas"
	fb_GfxScreenRes( 800, 600, 32, 1, 0, 0 );
	// i=0
	#line 8 "star.bas"
	I$0 = 0ll;
	// pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))
	// Hex float key: 0x1.p+1 = 2, 0x1.4p+2 = 5, 0x1.9p+7 = 200,
	// 0x1.9p+8 = 400 (sw/2), 0x1.2Cp+8 = 300 (sh/2).
	#line 9 "star.bas"
	fb_GfxPset( (void*)0ull, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, -2147483644, 0 );
	// for i=0 to 5
	{
		// The FOR loop is lowered to labels and a goto: the body at label$5
		// runs once before the bound is first tested after label$2.
		#line 10 "star.bas"
		I$0 = 0ll;
		#line 10 "star.bas"
		label$5:;
		{
			// 	line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))
			// Same expression as the pset above, with 0x1.p+2 = 4 as the
			// angle multiplier (the source's 2*...*2).
			#line 11 "star.bas"
			fb_GfxLine( (void*)0ull, 0x0p+0f, 0x0p+0f, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, 0, 65535u, -2147483646 );
			// next
		}
		#line 12 "star.bas"
		label$3:;
		// Increment, then branch back while i <= 5.
		#line 12 "star.bas"
		I$0 = I$0 + 1ll;
		#line 12 "star.bas"
		label$2:;
		#line 12 "star.bas"
		if( I$0 <= 5ll ) goto label$5;
		#line 12 "star.bas"
		label$4:;
	}
	// sleep
	#line 14 "star.bas"
	fb_Sleep( -1 );
	// system
	#line 15 "star.bas"
	fb_End( 0 );
	#line 15 "star.bas"
	label$1:;
	// A second fb_End(0) follows label$1, after the one emitted for "system".
	#line 15 "star.bas"
	fb_End( 0 );
	#line 15 "star.bas"
	return fb$result$0;
#line 15 "star.bas"
}

romichess · « **Reply #16 on:** February 24, 2020, 09:26:11 pm »

Quote from: _vince on February 24, 2020, 08:18:05 pm

freebasic can output the full asm file right before it gets assembled, but it also has inline assembly support so you'd never care to modify it anyway (Not that you should ever try to hand optimize modern asm)

Here's an excerpt:
Code: [Select]
const sw = 800
const sh = 600
dim shared as double pi = 2*asin(1)

screenres sw, sh, 32

i=0
pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))
for i=0 to 5
	line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))
next

sleep
system
excerpt from the .asm file
Code: [Select]
main:
.LFB0:
	.file 1 "star.bas"
	.loc 1 1 1
	.cfi_startproc
	push	rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	mov	rbp, rsp
	.cfi_def_cfa_register 6
	sub	rsp, 48
	mov	DWORD PTR -36[rbp], edi
	mov	QWORD PTR -48[rbp], rsi
	.loc 1 1 1
	mov	rax, QWORD PTR fs:40
	mov	QWORD PTR -8[rbp], rax
	xor	eax, eax
	.loc 1 1 2
	mov	DWORD PTR -20[rbp], 0
	.loc 1 1 2
	mov	QWORD PTR -16[rbp], 0
	.loc 1 1 2
	mov	rcx, QWORD PTR -48[rbp]
	mov	eax, DWORD PTR -36[rbp]
	mov	edx, 2
	mov	rsi, rcx
	mov	edi, eax
	call	fb_Init@PLT
.L2:
	.loc 1 6 2
	mov	r9d, 0
	mov	r8d, 0
	mov	ecx, 1
	mov	edx, 32
	mov	esi, 600
	mov	edi, 800
	call	fb_GfxScreenRes@PLT
	.loc 1 8 6
	mov	QWORD PTR -16[rbp], 0
	.loc 1 9 157
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 9 155
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm0, xmm1
	.loc 1 9 170
	addsd	xmm0, xmm0
	.loc 1 9 133
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	sin@PLT
	movapd	xmm1, xmm0
	.loc 1 9 195
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 9 207
	movsd	xmm0, QWORD PTR .LC2[rip]
	subsd	xmm0, xmm1
	.loc 1 9 2
	cvtsd2ss	xmm4, xmm0
	movss	DWORD PTR -40[rbp], xmm4
	.loc 1 9 60
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 9 58
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm0, xmm1
	.loc 1 9 73
	addsd	xmm0, xmm0
	.loc 1 9 36
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	cos@PLT
	movapd	xmm1, xmm0
	.loc 1 9 98
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 9 110
	movsd	xmm0, QWORD PTR .LC3[rip]
	addsd	xmm0, xmm1
	.loc 1 9 2
	cvtsd2ss	xmm0, xmm0
	mov	ecx, 0
	mov	edx, -2147483644
	mov	esi, 0
	movss	xmm1, DWORD PTR -40[rbp]
	mov	edi, 0
	call	fb_GfxPset@PLT
	.loc 1 10 7
	mov	QWORD PTR -16[rbp], 0
.L3:
	.loc 1 11 177
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 11 175
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 190
	movsd	xmm0, QWORD PTR .LC4[rip]
	mulsd	xmm0, xmm1
	.loc 1 11 153
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	sin@PLT
	movapd	xmm1, xmm0
	.loc 1 11 215
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 227
	movsd	xmm0, QWORD PTR .LC2[rip]
	subsd	xmm0, xmm1
	.loc 1 11 4
	cvtsd2ss	xmm5, xmm0
	movss	DWORD PTR -40[rbp], xmm5
	.loc 1 11 80
	mov	rax, QWORD PTR -16[rbp]
	cvtsi2sd	xmm1, rax
	.loc 1 11 78
	movsd	xmm0, QWORD PTR PI$[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 93
	movsd	xmm0, QWORD PTR .LC4[rip]
	mulsd	xmm0, xmm1
	.loc 1 11 56
	movsd	xmm1, QWORD PTR .LC0[rip]
	divsd	xmm0, xmm1
	call	cos@PLT
	movapd	xmm1, xmm0
	.loc 1 11 118
	movsd	xmm0, QWORD PTR .LC1[rip]
	mulsd	xmm1, xmm0
	.loc 1 11 130
	movsd	xmm0, QWORD PTR .LC3[rip]
	addsd	xmm0, xmm1
	.loc 1 11 4
	cvtsd2ss	xmm0, xmm0
	mov	r8d, -2147483646
	mov	ecx, 65535
	mov	edx, 0
	mov	esi, 0
	movss	xmm3, DWORD PTR -40[rbp]
	movaps	xmm2, xmm0
	pxor	xmm1, xmm1
	pxor	xmm0, xmm0
	mov	edi, 0
	call	fb_GfxLine@PLT
.L4:
	.loc 1 12 13
	mov	rax, QWORD PTR -16[rbp]
	add	rax, 1
	.loc 1 12 7
	mov	QWORD PTR -16[rbp], rax
.L5:
	.loc 1 12 11
	mov	rax, QWORD PTR -16[rbp]
	.loc 1 12 5
	cmp	rax, 5
	jg	.L10
	.loc 1 12 20 discriminator 2
	jmp	.L3
.L10:
	.loc 1 12 3
	nop
.L6:
	.loc 1 14 2
	mov	edi, -1
	call	fb_Sleep@PLT
	.loc 1 15 2
	mov	edi, 0
	call	fb_End@PLT
.L7:
	.loc 1 15 2
	mov	edi, 0
	call	fb_End@PLT
	.loc 1 15 9
	mov	eax, DWORD PTR -20[rbp]
	.loc 1 15 1
	mov	rdx, QWORD PTR -8[rbp]
	xor	rdx, QWORD PTR fs:40
	je	.L9
	call	__stack_chk_fail@PLT
.L9:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
It can also output a C source file, it is a sort of 'low level C' that allows freebasic to be multiplatform, though im not an expert on freebasic internals, quite interesting though
Code: [Select]
typedef   signed char       int8;
typedef unsigned char      uint8;
typedef   signed short      int16;
typedef unsigned short     uint16;
typedef   signed int        int32;
typedef unsigned int       uint32;
typedef   signed long long  int64;
typedef unsigned long long uint64;
typedef struct { char *data; int64 len; int64 size; } FBSTRING;
typedef int8 boolean;
#line 15 "star.bas"
void fb_GfxPset( void*, float, float, uint32, int32, int32 );
#line 15 "star.bas"
void fb_GfxLine( void*, float, float, float, float, uint32, int32, uint32, int32 );
#line 15 "star.bas"
int32 fb_GfxScreenRes( int32, int32, int32, int32, int32, int32 );
#line 15 "star.bas"
void fb_Init( int32, uint8**, int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_End( int32 );
#line 15 "star.bas"
void fb_Sleep( int32 );
#line 15 "star.bas"
static double PI$ = 0x1.921FB54442D18p+1;

#line 1 "star.bas"
int32 main( int32 __FB_ARGC__$0, char** __FB_ARGV__$0 )
#line 1 "star.bas"
{
	#line 1 "star.bas"
	int32 fb$result$0;
	#line 1 "star.bas"
	__builtin_memset( &fb$result$0, 0, 4ll );
	#line 1 "star.bas"
	int64 I$0;
	#line 1 "star.bas"
	__builtin_memset( &I$0, 0, 8ll );
	#line 1 "star.bas"
	fb_Init( __FB_ARGC__$0, (uint8**)__FB_ARGV__$0, 2 );
	#line 1 "star.bas"
	label$0:;
	// #lang "fblite"
	// const sw = 800
	// const sh = 600
	// dim shared as double pi = 2*asin(1)
	// screenres sw, sh, 32
	#line 6 "star.bas"
	fb_GfxScreenRes( 800, 600, 32, 1, 0, 0 );
	// i=0
	#line 8 "star.bas"
	I$0 = 0ll;
	// pset (200*cos(2*pi*i/5) + sw/2, sh/2 - 200*sin(2*pi*i/5))
	#line 9 "star.bas"
	fb_GfxPset( (void*)0ull, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+1) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, -2147483644, 0 );
	// for i=0 to 5
	{
		#line 10 "star.bas"
		I$0 = 0ll;
		#line 10 "star.bas"
		label$5:;
		{
			// 	line -(200*cos(2*pi*i*2/5) + sw/2, sh/2 - 200*sin(2*pi*i*2/5))
			#line 11 "star.bas"
			fb_GfxLine( (void*)0ull, 0x0p+0f, 0x0p+0f, (float)((__builtin_cos( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.9p+8), (float)(-(__builtin_sin( (((PI$ * (double)I$0) * 0x1.p+2) / 0x1.4p+2) ) * 0x1.9p+7) + 0x1.2Cp+8), 0u, 0, 65535u, -2147483646 );
			// next
		}
		#line 12 "star.bas"
		label$3:;
		#line 12 "star.bas"
		I$0 = I$0 + 1ll;
		#line 12 "star.bas"
		label$2:;
		#line 12 "star.bas"
		if( I$0 <= 5ll ) goto label$5;
		#line 12 "star.bas"
		label$4:;
	}
	// sleep
	#line 14 "star.bas"
	fb_Sleep( -1 );
	// system
	#line 15 "star.bas"
	fb_End( 0 );
	#line 15 "star.bas"
	label$1:;
	#line 15 "star.bas"
	fb_End( 0 );
	#line 15 "star.bas"
	return fb$result$0;
#line 15 "star.bas"
}

For 99.9% of the use cases out there you are 100% correct: there is no longer a need for handwritten assembler. However, there are special cases where handwritten assembler is a must. I gave one example in my code example above. In chess programming, for the last 12 years or so, magic bitboards have been untouchable for speed in move generation. It is just 7 machine-language instructions for a rook or a bishop, and 14 (R + B) for the queen. That would be unbeatable forever if it were not for two facts (three on Intel). First, the 7 instructions form a single dependency chain, which makes the code unfriendly to dual execution pipes. Second, it has one imul instruction, which is slightly more expensive even on today's processors. The third, for Intel, is that Intel CPUs have slow shift instructions, and magic uses several shifts. Now, my handwritten assembler for my new bitboard approach has 20 instructions, but they are split between two dependency chains that have zero cross-dependencies, and thus they will run in both pipes as though they were only 10 instructions running in one pipe. Overall they are also faster instructions, with no imul and only one shift. And that is why, in special cases, handwritten assembler can still be superior when it counts! :)

News:

Author Topic: How is a QB64 exe file made? (Read 12056 times)

_vince

Re: How is a QB64 exe file made?

romichess

Re: How is a QB64 exe file made?