cc65 codegen

Are you new to 6502, NES, or even programming in general? Post any of your questions here. Remember - the only dumb question is the question that remains unasked.

Moderator: Moderators

Post Reply
ajb
Posts: 14
Joined: Thu Apr 08, 2021 5:51 am

cc65 codegen

Post by ajb »

Ok, I know 6502 is 8bit and register starved, and cc65 codegen is supposed to be far from optimal, but I'm still finding myself surprised at the perf.

Here's an example.

This was my original code:

Code: Select all

/* Submits one sprite per active entry whose scroll-adjusted position
   lies inside the screen rectangle; other entries are skipped. */
void game_draw_sprites()
{
  int i;
  for (i=0; i < g_num_active; ++i) {
    /* Position read from g_x16_0/g_y16_0, shifted by the fixed offsets
       (3 and 4) and by the current scroll values. */
    s16 x = g_x16_0[i]-3-g_scroll_x;
    s16 y = g_y16_0[i]-4-g_scroll_y;
    /* Signed range test on both axes; oam_spr is called only when all
       four comparisons hold. */
    if ((x>=0) && (x < SCREEN_X_PIX) && (y>=0) && (y < SCREEN_Y_PIX))
      oam_spr(x, y, g_size[i], 0); 
  }
}
Mesen showed 3934 cycles for g_num_active=6, which is 655 cycles per iteration, with only 67 cycles in oam_spr.

I experimented a bit and got to this:

Code: Select all

/* Same culling loop as the first version, reworked to give the compiler
   less to do: 8-bit loop counter, locals declared once outside the loop,
   and one comparison per axis with an early continue. */
void game_draw_sprites()
{
  u8 i;
  u16 x, y;
  for (i=0; i < g_num_active; ++i) {
    x = g_x16_0[i]-3-g_scroll_x;
    /* Assuming u16 is an unsigned 16-bit type, a result below zero wraps
       to a large value, so this single test also rejects positions left
       of the screen. */
    if (x >= SCREEN_X_PIX)
      continue;
    /* y is only computed once x is known to be on screen. */
    y = g_y16_0[i]-4-g_scroll_y;
    /* Same wrap-around reasoning as for x. */
    if (y >= SCREEN_Y_PIX)
      continue;
    oam_spr(x, y, g_size[i], 0); 
  }
}
which is 2518 cycles -> 419 cycles per iteration.

The code generated is:

Code: Select all

; ---------------------------------------------------------------
; void __near__ game_draw_sprites (void)
;
; cc65 output for the version with stack-based locals.
; Loops i over 0..g_num_active-1. For each i it computes
;   x = g_x16_0[i]-3-g_scroll_x   and   y = g_y16_0[i]-4-g_scroll_y
; as 16-bit values held in X:A (X = high byte, A = low byte), skips the
; entry when x >= $0100 or y >= $00F0 (unsigned compares), and otherwise
; calls oam_spr(x, y, g_size[i], 0).
; Locals live on the C stack: i at (sp)+4, x at (sp)+2, y at (sp)+0.
; decsp5, decsp3, staxysp, stax0sp and incsp5 are not defined in this
; listing; they are presumably cc65 runtime helpers (confirm against
; the cc65 runtime sources).
; ---------------------------------------------------------------

.segment	"CODE"

.proc	_game_draw_sprites: near

	.dbg	func, "game_draw_sprites", "00", extern, "_game_draw_sprites"
	.dbg	sym, "i", "00", auto, -1
	.dbg	sym, "x", "00", auto, -3
	.dbg	sym, "y", "00", auto, -5

.segment	"CODE"

;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 116
; Presumably reserves 5 bytes of C stack for i, x and y.
	jsr     decsp5
; A = initial value of i, Y = offset of i from sp.
	lda     #$00
	ldy     #$04
; Loop head: write the new i, then leave through L0003 once
; i >= g_num_active (carry set after cmp).
L000D:	sta     (sp),y
	cmp     _g_num_active
	jcs     L0003
;
; x = g_x16_0[i]-3-g_scroll_x;
;
	.dbg	line, "..\test.c", 117
; Build ptr1 = _g_x16_0 + i*2. asl doubles i; a carry out of the shift
; is moved into X so it ends up in the high byte of the offset.
	ldx     #$00
	lda     (sp),y
	asl     a
	bcc     L000E
	inx
	clc
L000E:	adc     #<(_g_x16_0)
	sta     ptr1
	txa
	adc     #>(_g_x16_0)
	sta     ptr1+1
; Load the 16-bit element: high byte into X, low byte into A.
	ldy     #$01
	lda     (ptr1),y
	tax
	dey
	lda     (ptr1),y
; Subtract 3 from the low byte; on borrow (carry clear) decrement the
; high byte in X and set carry again for the next subtraction.
	sec
	sbc     #$03
	bcs     L000F
	dex
	sec
; 16-bit subtract of g_scroll_x. The low result is parked on the hardware
; stack (pha/pla) because A is needed to compute the high byte.
L000F:	sbc     _g_scroll_x
	pha
	txa
	sbc     _g_scroll_x+1
	tax
	pla
; Y = 2 selects the slot of x; staxysp presumably stores X:A at (sp),y.
	ldy     #$02
	jsr     staxysp
;
; if (x >= SCREEN_X_PIX) 
;
	.dbg	line, "..\test.c", 118
; 16-bit unsigned compare against $0100. cmp #$00 always leaves carry
; set, so the result is decided by the high byte: carry set <=> X >= 1.
	cmp     #$00
	txa
	sbc     #$01
;
; continue;
;
	.dbg	line, "..\test.c", 119
; Carry set: x is out of range, go to the loop increment.
	bcs     L0004
;
; y = g_y16_0[i]-4-g_scroll_y;
;
	.dbg	line, "..\test.c", 120
; Same sequence as for x: reload i from (sp)+4, form
; ptr1 = _g_y16_0 + i*2, load the element, subtract 4 and g_scroll_y.
	ldy     #$04
	ldx     #$00
	lda     (sp),y
	asl     a
	bcc     L0010
	inx
	clc
L0010:	adc     #<(_g_y16_0)
	sta     ptr1
	txa
	adc     #>(_g_y16_0)
	sta     ptr1+1
	ldy     #$01
	lda     (ptr1),y
	tax
	dey
	lda     (ptr1),y
	sec
	sbc     #$04
	bcs     L0011
	dex
	sec
L0011:	sbc     _g_scroll_y
	pha
	txa
	sbc     _g_scroll_y+1
	tax
	pla
; stax0sp presumably stores X:A at (sp)+0, the slot of y.
	jsr     stax0sp
;
; if (y >= SCREEN_Y_PIX)
;
	.dbg	line, "..\test.c", 121
; 16-bit unsigned compare against $00F0; carry set <=> y >= $00F0.
; NOTE(review): this uses A as left by stax0sp, so the helper is
; expected to preserve the low byte in A -- confirm.
	cmp     #$F0
	txa
	sbc     #$00
;
; continue;
;
	.dbg	line, "..\test.c", 122
	bcs     L0004
;
; oam_spr(x, y, g_size[i], 0);
;
	.dbg	line, "..\test.c", 123
; decsp3 presumably reserves 3 bytes for the stacked arguments; the
; offsets used below address the locals 3 bytes higher than before.
	jsr     decsp3
; Low byte of x: (sp)+5 -> argument slot (sp)+2.
	ldy     #$05
	lda     (sp),y
	ldy     #$02
	sta     (sp),y
; Low byte of y: iny makes Y = 3; (sp)+3 -> argument slot (sp)+1.
	iny
	lda     (sp),y
	ldy     #$01
	sta     (sp),y
; i is read from (sp)+7 and moved to Y to index g_size;
; g_size[i] -> argument slot (sp)+0.
	ldy     #$07
	lda     (sp),y
	tay
	lda     _g_size,y
	ldy     #$00
	sta     (sp),y
; Y is 0 here, so tya puts the last argument (0) into A for the call.
	tya
	jsr     _oam_spr
;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 116
; A = i + 1. The store back into i happens at L000D, which expects
; Y = 4 and the new value in A.
L0004:	ldy     #$04
	clc
	lda     #$01
	adc     (sp),y
	jmp     L000D
;
; }
;
	.dbg	line, "..\test.c", 125
; Leave by jumping to incsp5 (presumably releases the 5 local bytes and
; returns to the caller with its own rts).
L0003:	jmp     incsp5

	.dbg	line
.endproc
I'm still a 6502 beginner, so my takeaways so far: >8b is expensive and function calls are expensive.

But I'm lacking an understanding of:
1/ how much more can I optimize in c, and what is needed?
2/ if rewritten in straightforward asm, how much faster it would be?
3/ ditto for all-out optimal asm?

Would appreciate some wisdom here.
Thanks!
calima
Posts: 1745
Joined: Tue Oct 06, 2015 10:16 am

Re: cc65 codegen

Post by calima »

Follow the cc65 docs and wiki, they have lots of optimization tips. E.g. your code still uses stack variables.
lidnariq
Posts: 11430
Joined: Sun Apr 13, 2008 11:12 am

Re: cc65 codegen

Post by lidnariq »

Arrays of not-8-bit things are expensive too. The only way to get cc65 to emit the faster (lda abs,[xy]) instructions is when accessing arrays of char.
User avatar
dougeff
Posts: 3078
Joined: Fri May 08, 2015 7:17 pm

Re: cc65 codegen

Post by dougeff »

I feel this is "good enough" and not worth optimizing.
nesdoug.com -- blog/tutorial on programming for the NES
ajb
Posts: 14
Joined: Thu Apr 08, 2021 5:51 am

Re: cc65 codegen

Post by ajb »

Thanks for the feedback!

I reviewed https://cc65.github.io/doc/coding.html and found this https://github.com/ilmenit/CC65-Advanced-Optimizations which suggested a few things.

Optimizing from 2518 cycles:
2175 <- making i,x,y register vars
2033 <- #pragma static-locals
1965 <- putting them manually in zero page

generated code for 1965:

Code: Select all

; ---------------------------------------------------------------
; void __near__ game_draw_sprites (void)
;
; cc65 output for the version whose locals are the named variables
; _i, _x and _y instead of C-stack slots. The algorithm is unchanged:
; for each i below g_num_active compute
;   x = g_x16_0[i]-3-g_scroll_x   and   y = g_y16_0[i]-4-g_scroll_y
; in X:A (X = high byte, A = low byte), skip the entry when x >= $0100
; or y >= $00F0 (unsigned), otherwise call oam_spr(x, y, g_size[i], 0).
; Only the oam_spr arguments still go through the C stack.
; ---------------------------------------------------------------

.segment	"CODE"

.proc	_game_draw_sprites: near

	.dbg	func, "game_draw_sprites", "00", extern, "_game_draw_sprites"

.segment	"CODE"

;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 122
	lda     #$00
	sta     _i
; Loop head: continue at L0013 while i < g_num_active (carry clear),
; otherwise fall through to the rts below.
L0011:	lda     _i
	cmp     _g_num_active
	bcc     L0013
;
; }
;
	.dbg	line, "..\test.c", 131
	rts
;
; x = g_x16_0[i]-3-g_scroll_x;
;
	.dbg	line, "..\test.c", 123
; Build ptr1 = _g_x16_0 + i*2. A carry out of the asl is moved into X
; so it ends up in the high byte of the offset.
L0013:	ldx     #$00
	lda     _i
	asl     a
	bcc     L000D
	inx
	clc
L000D:	adc     #<(_g_x16_0)
	sta     ptr1
	txa
	adc     #>(_g_x16_0)
	sta     ptr1+1
; Load the 16-bit element: high byte into X, low byte into A.
; Y is left at 0 after the dey.
	ldy     #$01
	lda     (ptr1),y
	tax
	dey
	lda     (ptr1),y
; Subtract 3; on borrow decrement the high byte in X and set carry
; again for the next subtraction.
	sec
	sbc     #$03
	bcs     L000E
	dex
	sec
; 16-bit subtract of g_scroll_x. The low result is parked on the hardware
; stack (pha/pla) because A is needed to compute the high byte.
L000E:	sbc     _g_scroll_x
	pha
	txa
	sbc     _g_scroll_x+1
	tax
	pla
	sta     _x
	stx     _x+1
;
; if (x >= SCREEN_X_PIX)
;
	.dbg	line, "..\test.c", 124
; 16-bit unsigned compare against $0100. cmp #$00 always leaves carry
; set, so the result is decided by the high byte: carry set <=> X >= 1.
	cmp     #$00
	txa
	sbc     #$01
;
; continue;
;
	.dbg	line, "..\test.c", 125
	bcs     L0012
;
; y = g_y16_0[i]-4-g_scroll_y;
;
	.dbg	line, "..\test.c", 126
; Same sequence as for x, using _g_y16_0, the constant 4 and g_scroll_y.
	ldx     #$00
	lda     _i
	asl     a
	bcc     L000F
	inx
	clc
L000F:	adc     #<(_g_y16_0)
	sta     ptr1
	txa
	adc     #>(_g_y16_0)
	sta     ptr1+1
; Y is still 0 from the dey in the x computation (nothing in between
; writes Y), so iny yields index 1 for the high byte.
	iny
	lda     (ptr1),y
	tax
	dey
	lda     (ptr1),y
	sec
	sbc     #$04
	bcs     L0010
	dex
	sec
L0010:	sbc     _g_scroll_y
	pha
	txa
	sbc     _g_scroll_y+1
	tax
	pla
	sta     _y
	stx     _y+1
;
; if (y >= SCREEN_Y_PIX)
;
	.dbg	line, "..\test.c", 127
; 16-bit unsigned compare against $00F0; carry set <=> y >= $00F0.
; A still holds the low byte here because sta/stx do not modify A.
	cmp     #$F0
	txa
	sbc     #$00
;
; continue;
;
	.dbg	line, "..\test.c", 128
	bcs     L0012
;
; oam_spr(x, y, g_size[i], 0);
;
	.dbg	line, "..\test.c", 129
; decsp3 is not defined in this listing; presumably a cc65 runtime helper
; that reserves 3 bytes of C stack for the arguments (confirm).
	jsr     decsp3
; Low byte of x -> argument slot (sp)+2.
	lda     _x
	ldy     #$02
	sta     (sp),y
; Low byte of y -> argument slot (sp)+1 (dey takes Y from 2 to 1).
	lda     _y
	dey
	sta     (sp),y
; g_size[i], indexed directly with Y = i -> argument slot (sp)+0.
	ldy     _i
	lda     _g_size,y
	ldy     #$00
	sta     (sp),y
; Y is 0 here, so tya puts the last argument (0) into A for the call.
	tya
	jsr     _oam_spr
;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 122
; Loop increment, also the target of both "continue" branches.
L0012:	inc     _i
	jmp     L0011

	.dbg	line
.endproc
lidnariq wrote:
> Arrays of not-8-bit things are expensive too.

Is there anything to be done here?

dougeff wrote:
> I feel this is "good enough" and not worth optimizing.

Ok, this is the sort of wisdom I'm looking for!

So ~325/sprite totally reasonable?

Any SWAGs on cycles if I rewrote in asm?
User avatar
tokumaru
Posts: 12427
Joined: Sat Feb 12, 2005 9:43 pm
Location: Rio de Janeiro - Brazil

Re: cc65 codegen

Post by tokumaru »

ajb wrote: Sat May 08, 2021 12:10 pm
Arrays of not-8-bit things are expensive too.
Is there anything to be done here?
Split arrays of non-8-bit things into multiple arrays of 8-bit things.
lidnariq
Posts: 11430
Joined: Sun Apr 13, 2008 11:12 am

Re: cc65 codegen

Post by lidnariq »

ajb wrote: Sat May 08, 2021 12:10 pm So ~325/sprite totally reasonable?
I mean, the right answer is "don't worry about performance until you actually start running into performance problems". Then you can start worrying about it.

CC65 knows that if you have an array of bytes, and you index it with a byte, you can use the fast instructions.

Because the 6502's fast instructions can only index through 256 bytes, cc65 has to do obnoxious pointer math on anything that could possibly index through more. So if you're dealing with anything other than an array of at most 256 bytes, cc65 often produces better code with walking pointers that have been declared to be in zero page (pragma zpsym).
ajb
Posts: 14
Joined: Thu Apr 08, 2021 5:51 am

Re: cc65 codegen

Post by ajb »

Heh, well I've already hit those perf problems, so that's why I'm worrying :)

Of course, my physics is much more expensive than this draw code, but I figured I'd start with something more simple to get into cc65/6502 optimizing before tackling more complex code.

OK, I tried splitting the 16b array into two 8b arrays. This takes it from 1923 cycles to 1630, so not insignificant. It is, however, crossing a threshold in code-complexity cost, though I could apply a macro band-aid. Moving the arrays to zero page didn't change the perf at all.

Code: Select all


; void __near__ game_draw_sprites (void)
;
; cc65 output for the version that stores each 16-bit position as two
; byte arrays (g_x_l/g_x_h and g_y_l/g_y_h). The constant offsets are
; folded into bx = g_scroll_x+3 and by = g_scroll_y+4 once, before the
; loop. For each i below g_num_active the code computes x and y in X:A
; (X = high byte, A = low byte), skips the entry when x >= $0100 or
; y >= $00F0 (unsigned), otherwise calls oam_spr(x, y, g_size[i], 0).
; ---------------------------------------------------------------

.segment	"CODE"

.proc	_game_draw_sprites: near

	.dbg	func, "game_draw_sprites", "00", extern, "_game_draw_sprites"

.segment	"CODE"

;
; bx = g_scroll_x+3;
;
	.dbg	line, "..\test.c", 130
; 16-bit add of 3: a carry out of the low byte increments the high
; byte held in X.
	lda     _g_scroll_x
	ldx     _g_scroll_x+1
	clc
	adc     #$03
	bcc     L0002
	inx
L0002:	sta     _bx
	stx     _bx+1
;
; by = g_scroll_y+4;
;
	.dbg	line, "..\test.c", 131
; Same pattern with the constant 4.
	lda     _g_scroll_y
	ldx     _g_scroll_y+1
	clc
	adc     #$04
	bcc     L0003
	inx
L0003:	sta     _by
	stx     _by+1
;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 132
	lda     #$00
	sta     _i
; Loop head: leave through L0005 once i >= g_num_active (carry set).
L000F:	lda     _i
	cmp     _g_num_active
	bcs     L0005
;
; x = (g_x_l[i] | (g_x_h[i]<<8)) -bx;
;
	.dbg	line, "..\test.c", 133
; Both bytes are fetched with absolute,Y indexing: high byte into X, low
; byte into A. The second ldy _i reloads a value Y already holds (ldx
; does not change Y).
	ldy     _i
	ldx     _g_x_h,y
	ldy     _i
	lda     _g_x_l,y
; 16-bit subtract of bx. The low result is parked on the hardware stack
; (pha/pla) because A is needed to compute the high byte.
	sec
	sbc     _bx
	pha
	txa
	sbc     _bx+1
	tax
	pla
	sta     _x
	stx     _x+1
;
; if (x >= SCREEN_X_PIX)
;
	.dbg	line, "..\test.c", 134
; 16-bit unsigned compare against $0100. cmp #$00 always leaves carry
; set, so the result is decided by the high byte: carry set <=> X >= 1.
	cmp     #$00
	txa
	sbc     #$01
;
; continue;
;
	.dbg	line, "..\test.c", 135
	bcs     L0010
;
; y = (g_y_l[i] | (g_y_h[i]<<8)) -by;
;
	.dbg	line, "..\test.c", 136
; Same sequence as for x, using g_y_h/g_y_l and by.
	ldy     _i
	ldx     _g_y_h,y
	ldy     _i
	lda     _g_y_l,y
	sec
	sbc     _by
	pha
	txa
	sbc     _by+1
	tax
	pla
	sta     _y
	stx     _y+1
;
; if (y >= SCREEN_Y_PIX)
;
	.dbg	line, "..\test.c", 137
; 16-bit unsigned compare against $00F0; carry set <=> y >= $00F0.
; A still holds the low byte here because sta/stx do not modify A.
	cmp     #$F0
	txa
	sbc     #$00
;
; continue;
;
	.dbg	line, "..\test.c", 138
	bcs     L0010
;
; oam_spr(x, y, g_size[i], 0);
;
	.dbg	line, "..\test.c", 139
; decsp3 is not defined in this listing; presumably a cc65 runtime helper
; that reserves 3 bytes of C stack for the arguments (confirm).
	jsr     decsp3
; Low byte of x -> argument slot (sp)+2.
	lda     _x
	ldy     #$02
	sta     (sp),y
; Low byte of y -> argument slot (sp)+1 (dey takes Y from 2 to 1).
	lda     _y
	dey
	sta     (sp),y
; g_size[i], indexed directly with Y = i -> argument slot (sp)+0.
	ldy     _i
	lda     _g_size,y
	ldy     #$00
	sta     (sp),y
; Y is 0 here, so tya puts the last argument (0) into A for the call.
	tya
	jsr     _oam_spr
;
; for (i=0; i < g_num_active; ++i) {
;
	.dbg	line, "..\test.c", 132
; Loop increment, also the target of both "continue" branches.
L0010:	inc     _i
	jmp     L000F
;
; }
;
	.dbg	line, "..\test.c", 141
L0005:	rts

	.dbg	line
.endproc
lidnariq wrote:
> CC65 knows that if you have an array of bytes, and you index it with a byte, you can use the fast instructions.

Is this hitting the fast indexing path now? What do I look for?
lidnariq
Posts: 11430
Joined: Sun Apr 13, 2008 11:12 am

Re: cc65 codegen

Post by lidnariq »

ajb wrote: Sat May 08, 2021 4:26 pm

Code: Select all

; x = (g_x_l[i] | (g_x_h[i]<<8)) -bx;
;
; Excerpt: both bytes of the element are fetched with absolute,Y indexed
; loads, with no pointer set up in ptr1.
	.dbg	line, "..\test.c", 133
; Y = i; high byte of the element goes into X.
	ldy     _i
	ldx     _g_x_h,y
; Y already holds i here (ldx does not change Y), so this reload is
; redundant. Low byte of the element goes into A.
	ldy     _i
	lda     _g_x_l,y
[...]Is this hitting the fast indexing path now? What do I look for?
What you have above is the "fast" form: ld_ something,x (or something,y, as in your listing). And you're right, there's almost no incremental benefit to having these arrays in zero page.

If for some reason you couldn't easily stripe your arrays into bytes, that's where the "pointer in zero page" helps.

I don't really see any other obvious "cc65 doing something unhelpful" left. edit: that's not true, there's stupid stack things going on because it's placing this 16-bit temporary in X:A and has to store A on the stack to calculate the upper byte. Maybe generating an explicit named 16-bit temporary would perform better than that resulting pha/txa/tax/pla
User avatar
dougeff
Posts: 3078
Joined: Fri May 08, 2015 7:17 pm

Re: cc65 codegen

Post by dougeff »

The final x and y are char sized. You don't need the high byte of anything for any of these calculations.

cast everything to (char) or just use the low byte, and the answer will be the same, but with less code gen.
nesdoug.com -- blog/tutorial on programming for the NES
lidnariq
Posts: 11430
Joined: Sun Apr 13, 2008 11:12 am

Re: cc65 codegen

Post by lidnariq »

That's not true: the upper byte is what is used to detect the "continue" cases.
ajb
Posts: 14
Joined: Thu Apr 08, 2021 5:51 am

Re: cc65 codegen

Post by ajb »

Thanks for the asm analysis lidnariq.

Dougeff, my game levels are >> 256x256 pixels, so I've got to cull and translate into screen coords.
Post Reply