Here's an example.
This was my original code:
Code: Select all
/*
 * Draw every active sprite whose scrolled position lies on screen.
 *
 * For each index below g_num_active the position is read from g_x16_0 /
 * g_y16_0, offset by (-3, -4) and by the scroll position, and held in a
 * signed 16-bit temporary so that the lower bound can be tested against 0.
 * Sprites inside [0, SCREEN_X_PIX) x [0, SCREEN_Y_PIX) are handed to
 * oam_spr() together with g_size[index] and a final argument of 0.
 */
void game_draw_sprites()
{
    int idx;

    for (idx = 0; idx < g_num_active; ++idx) {
        /* Both coordinates are computed up front, before any range test. */
        s16 screen_x = g_x16_0[idx] - 3 - g_scroll_x;
        s16 screen_y = g_y16_0[idx] - 4 - g_scroll_y;

        /* Reject anything that falls off the left/right edge... */
        if (screen_x < 0 || screen_x >= SCREEN_X_PIX)
            continue;
        /* ...or off the top/bottom edge. */
        if (screen_y < 0 || screen_y >= SCREEN_Y_PIX)
            continue;

        oam_spr(screen_x, screen_y, g_size[idx], 0);
    }
}
I experimented a bit and got to this:
Code: Select all
/*
 * Draw every active sprite whose scrolled position lies on screen.
 *
 * For each index below g_num_active the position is read from g_x16_0 /
 * g_y16_0, offset by (-3, -4) and by the scroll position. A sprite that
 * passes both range tests is handed to oam_spr() together with g_size[i]
 * and a final argument of 0.
 *
 * NOTE(review): u8/u16 are typedefs not visible here; the comments below
 * assume they are unsigned 8-/16-bit types -- confirm.
 */
void game_draw_sprites()
{
u8 i;
u16 x, y;
for (i=0; i < g_num_active; ++i) {
x = g_x16_0[i]-3-g_scroll_x;
/* With an unsigned x, a position left of the screen presumably wraps to a
   large value, so this single test rejects both the left and right side. */
if (x >= SCREEN_X_PIX)
continue;
/* y is only computed once x is known to be on screen. */
y = g_y16_0[i]-4-g_scroll_y;
/* Same single unsigned test for above/below the screen. */
if (y >= SCREEN_Y_PIX)
continue;
oam_spr(x, y, g_size[i], 0);
}
}
The code generated is:
Code: Select all
; ---------------------------------------------------------------
; void __near__ game_draw_sprites (void)
; ---------------------------------------------------------------
; Review annotations: the locals are addressed through (sp),y in the
; code below -- y at offsets 0-1, x at offsets 2-3, i at offset 4.
; decsp5, staxysp, stax0sp, decsp3 and incsp5 are runtime helpers whose
; bodies are not part of this listing; notes about them are assumptions.
.segment "CODE"
.proc _game_draw_sprites: near
.dbg func, "game_draw_sprites", "00", extern, "_game_draw_sprites"
.dbg sym, "i", "00", auto, -1
.dbg sym, "x", "00", auto, -3
.dbg sym, "y", "00", auto, -5
.segment "CODE"
;
; for (i=0; i < g_num_active; ++i) {
;
.dbg line, "..\test.c", 116
; Presumably reserves the 5-byte frame for i, x and y.
jsr decsp5
; A = initial i (0), Y = 4 = frame offset of i.
lda #$00
ldy #$04
; Loop head: A holds the new value of i on every entry; store it, then
; leave the loop when i >= g_num_active (carry set after cmp).
L000D: sta (sp),y
cmp _g_num_active
jcs L0003
;
; x = g_x16_0[i]-3-g_scroll_x;
;
.dbg line, "..\test.c", 117
; Form i*2 as a 16-bit offset: A = low byte, X picks up the bit shifted
; out of A. Y is still 4 here, so (sp),y reads i.
ldx #$00
lda (sp),y
asl a
bcc L000E
inx
clc
; ptr1 = address of g_x16_0 + i*2.
L000E: adc #<(_g_x16_0)
sta ptr1
txa
adc #>(_g_x16_0)
sta ptr1+1
; Load the 16-bit element: high byte into X, then low byte into A.
ldy #$01
lda (ptr1),y
tax
dey
lda (ptr1),y
; Subtract 3 from the low byte; on borrow the high byte in X is decremented.
sec
sbc #$03
bcs L000F
dex
sec
; Subtract the 16-bit g_scroll_x; the low result byte waits on the CPU
; stack while the high byte is processed.
L000F: sbc _g_scroll_x
pha
txa
sbc _g_scroll_x+1
tax
pla
; Store A/X into x at frame offset 2 (via the runtime helper).
ldy #$02
jsr staxysp
;
; if (x >= SCREEN_X_PIX)
;
.dbg line, "..\test.c", 118
; 16-bit compare against $0100: cmp on the low byte sets up the carry,
; sbc on the high byte leaves carry set when x >= $0100.
; NOTE(review): relies on staxysp leaving A/X intact -- confirm.
cmp #$00
txa
sbc #$01
;
; continue;
;
.dbg line, "..\test.c", 119
; Carry set: x is out of range, go straight to the loop increment.
bcs L0004
;
; y = g_y16_0[i]-4-g_scroll_y;
;
.dbg line, "..\test.c", 120
; Same sequence as for x: reload Y = 4 to read i, form i*2 in A/X.
ldy #$04
ldx #$00
lda (sp),y
asl a
bcc L0010
inx
clc
; ptr1 = address of g_y16_0 + i*2.
L0010: adc #<(_g_y16_0)
sta ptr1
txa
adc #>(_g_y16_0)
sta ptr1+1
; Load the 16-bit element: high byte into X, then low byte into A.
ldy #$01
lda (ptr1),y
tax
dey
lda (ptr1),y
; Subtract 4 from the low byte; on borrow the high byte in X is decremented.
sec
sbc #$04
bcs L0011
dex
sec
; Subtract the 16-bit g_scroll_y.
L0011: sbc _g_scroll_y
pha
txa
sbc _g_scroll_y+1
tax
pla
; Store A/X into y; presumably at frame offset 0, judging by the helper's
; name and the reads that follow.
jsr stax0sp
;
; if (y >= SCREEN_Y_PIX)
;
.dbg line, "..\test.c", 121
; 16-bit compare against $00F0: carry set when y >= $00F0.
cmp #$F0
txa
sbc #$00
;
; continue;
;
.dbg line, "..\test.c", 122
; Carry set: y is out of range, go straight to the loop increment.
bcs L0004
;
; oam_spr(x, y, g_size[i], 0);
;
.dbg line, "..\test.c", 123
; Presumably reserves 3 argument bytes; the local offsets used below are
; the earlier ones plus 3 (x: 2 -> 5, y: 0 -> 3, i: 4 -> 7).
jsr decsp3
; Low byte of x -> argument slot at offset 2.
ldy #$05
lda (sp),y
ldy #$02
sta (sp),y
; Y becomes 3: low byte of y -> argument slot at offset 1.
iny
lda (sp),y
ldy #$01
sta (sp),y
; i -> Y, then g_size[i] -> argument slot at offset 0.
ldy #$07
lda (sp),y
tay
lda _g_size,y
ldy #$00
sta (sp),y
; Y is 0 here, so A = 0: the last argument is passed in A.
tya
; NOTE(review): nothing in this listing releases the 3 argument bytes,
; so _oam_spr is presumably expected to do that -- confirm.
jsr _oam_spr
;
; for (i=0; i < g_num_active; ++i) {
;
.dbg line, "..\test.c", 116
; ++i: A = i + 1 with Y = 4, then back to L000D, which stores A into i
; and repeats the bound test.
L0004: ldy #$04
clc
lda #$01
adc (sp),y
jmp L000D
;
; }
;
.dbg line, "..\test.c", 125
; Tail jump to the runtime helper; presumably it releases the 5-byte
; frame and returns to the caller.
L0003: jmp incsp5
.dbg line
.endproc
But I lack an understanding of the following:
1/ How much more can I optimize this in C, and what would that take?
2/ If it were rewritten in straightforward asm, how much faster would it be?
3/ Ditto for all-out optimal asm?
Would appreciate some wisdom here.
Thanks!