// File: picovga-RGsB/_picovga/render/vga_tilepersp3.S
// Last modified: 2021-06-10 19:07:49 +02:00
// Listing info: 395 lines, 13 KiB, ArmAsm

// ****************************************************************************
//
// VGA render GF_TILEPERSP3
//
// ****************************************************************************
// data ... tile map
// par ... column of tile images
// par2 ... pointer to 6 matrix integer parameters m11,m12..m23 ((int)(m*FRACTMUL))
// par3 ... LOW8=number of bits of tile width and height, HIGH8=horizon offset
// wb ... LOW8=number of bits of tile map width, HIGH8=number of bits of tile map height
// wrapy ... segment height
#include "../define.h" // common definitions of C and ASM
#include "hardware/regs/sio.h" // registers of hardware divider
#include "hardware/regs/addressmap.h" // SIO base address
// ---- interpolator register offsets
// Byte offsets used as immediates in "ldr/str Rt,[R3,#offset]", where R3
// holds the address stored at RenderTilePersp_Interp (ACCUM0 register of
// interpolator 0). Names ending in 0 address interpolator 0, names ending
// in 1 address interpolator 1, whose registers lie 0x40 bytes higher.
// All values are word multiples in the range 0x00..0x7C.

// interpolator 0
#define ACCUM0_OFFSET0 0x00
#define ACCUM1_OFFSET0 0x04
#define BASE0_OFFSET0 0x08
#define BASE1_OFFSET0 0x0C
#define BASE2_OFFSET0 0x10
#define POP_LANE0_OFFSET0 0x14
#define POP_LANE1_OFFSET0 0x18
#define POP_FULL_OFFSET0 0x1C
#define PEEK_LANE0_OFFSET0 0x20
#define PEEK_LANE1_OFFSET0 0x24
#define PEEK_FULL_OFFSET0 0x28
#define CTRL_LANE0_OFFSET0 0x2C
#define CTRL_LANE1_OFFSET0 0x30
#define ACCUM0_ADD_OFFSET0 0x34
#define ACCUM1_ADD_OFFSET0 0x38
#define BASE_1AND0_OFFSET0 0x3C

// interpolator 1 (same layout, 0x40 bytes above interpolator 0)
#define ACCUM0_OFFSET1 0x40
#define ACCUM1_OFFSET1 0x44
#define BASE0_OFFSET1 0x48
#define BASE1_OFFSET1 0x4C
#define BASE2_OFFSET1 0x50
#define POP_LANE0_OFFSET1 0x54
#define POP_LANE1_OFFSET1 0x58
#define POP_FULL_OFFSET1 0x5C
#define PEEK_LANE0_OFFSET1 0x60
#define PEEK_LANE1_OFFSET1 0x64
#define PEEK_FULL_OFFSET1 0x68
#define CTRL_LANE0_OFFSET1 0x6C
#define CTRL_LANE1_OFFSET1 0x70
#define ACCUM0_ADD_OFFSET1 0x74
#define ACCUM1_ADD_OFFSET1 0x78
#define BASE_1AND0_OFFSET1 0x7C
// ---- assembler setup
// Unified ARM/Thumb syntax, Thumb code for Cortex-M0+, placed in the
// allocatable + executable section .time_critical.Render.
.syntax unified
.section .time_critical.Render, "ax"
.cpu cortex-m0plus
.thumb // use 16-bit instructions
// extern "C" u32* RenderTilePersp3(u32* cbuf, int x, int y, int w, sSegm* segm);
// Render one line of tiles with perspective projection (GF_TILEPERSP3, triple pixels).
//
// Resources used: hardware interpolators interp0 and interp1 (their state is
// not saved during interrupt) and the SIO hardware divider, which is
// reprogrammed here as well.
//
// Input:
// R0 ... pointer to destination data buffer
// R1 ... start X coordinate (not used)
// R2 ... start Y coordinate (in graphics lines)
// R3 ... width to display (must be multiple of 4)
// [stack] ... segm video segment sSegm
// Output:
// R0 ... new pointer to data buffer (advanced by 4 bytes per stored word)
// Clobbers: R1, R2, R3, R12, flags. R4-R7 are restored from the stack.
// 320 pixels takes ?? us on 151 MHz (timing was never filled in).
//
// Note: the numeric local label "2:" is defined twice; every "2f" refers to
// the nearest following definition.
.thumb_func
.global RenderTilePersp3
RenderTilePersp3:
// Input registers and stack (layout after the push below):
// R0 ... pointer to destination data buffer
// R1 ... X coordinate (not used)
// R2 ... Y coordinate
// SP+0: R3 ... remaining width (slot is later reused for the tile bits)
// SP+4: R4
// SP+8: R5
// SP+12: R6
// SP+16: R7
// SP+20: LR
// SP+24: video segment
// push registers (R3 is pushed only to reserve the scratch slot at SP+0)
push {r3-r7,lr}
// ---- prepare registers
// get pointer to video segment -> R4
ldr r4,[sp,#24] // load video segment -> R4
// R0 ... pointer to data buffer
// R2 ... Y coordinate
// R3 ... remaining width
// R4 ... video segment
// load horizon offset -> R1, check if use perspective
ldr r6,RenderTilePersp_pSioBase // get address of SIO base -> R6
ldrh r5,[r4,#SSEGM_WRAPY] // get segment height h -> R5
ldrb r1,[r4,#SSEGM_PAR3+1] // get horizon offset (HIGH8 of par3) -> R1
sxtb r1,r1 // signed extension
lsls r1,#2 // horizon * 4; sets Z (horizon = 0 ?) and N (horizon < 0 ?)
bne 2f // horizon != 0: use perspective (N flag is still valid at 2:)
// not using perspective, start Y coordinate y0 = y - h/2 -> R12
lsrs r5,#1 // segment height/2 -> R5
subs r2,r5 // y - h/2 -> R2
mov r12,r2 // current coordinate Y0 = y - h/2 -> R12
// prepare divide result to get 1<<FRACT (quotient of (1<<FRACT)/1),
// so the matrix scaling below works unchanged with dist = FRACTMUL
movs r5,#1 // R5 <- 1
str r5,[r6,#SIO_DIV_UDIVISOR_OFFSET] // divisor = 1
lsls r5,#FRACT // constant 1<<FRACT -> R5
str r5,[r6,#SIO_DIV_UDIVIDEND_OFFSET] // dividend = FRACTMUL
b 4f
// using perspective, check ceiling mode (negative horizon)
2: bpl 3f // horizon is not negative (N flag from "lsls r1,#2" above)
subs r2,r5,r2 // negate, y = h - y
subs r2,#1 // y = h - 1 - y
negs r1,r1 // absolute value of horizon*4
// prepare current coordinate Y0 = y - h -> R12
3: subs r7,r2,r5 // y - h = current Y coordinate -> R7
mov r12,r7 // store current coordinate Y0 -> R12
// start calculating distance coefficient dist = FRACTMUL*h/(y + horiz),
// where horiz is the horizon offset * 4 held in R1 (unsigned division)
lsls r5,#FRACT // segment height * FRACTMUL -> R5
str r5,[r6,#SIO_DIV_UDIVIDEND_OFFSET] // store dividend, FRACTMUL*h
adds r2,r1 // horizon*4 + y -> R2
str r2,[r6,#SIO_DIV_UDIVISOR_OFFSET] // store divisor, y + horiz
// R0 ... pointer to data buffer
// R3 ... remaining width
// R4 ... video segment
// R12 ... current coordinate Y0
// prepare start coordinate X0 = -w/2 -> LR (LR is free, it was saved by push)
4: lsrs r5,r3,#1 // width/2
negs r5,r5 // negate
mov lr,r5 // store start coordinate X0 -> LR
// prepare number of 4-pixels (loop counter) -> R7
lsrs r7,r3,#2 // width/4 -> R7
// prepare address of interpolator 0 base -> R3
ldr r3,RenderTilePersp_Interp // get address of interpolator 0 base -> R3
// R0 ... pointer to data buffer
// R3 ... interpolator base
// R4 ... video segment
// R7 ... width/4
// LR ... start coordinate X0
// R12 ... current coordinate Y0
// ---- setup interpolator 0 to get tile index
// set tile map base to base2
ldr r6,[r4,#SSEGM_DATA] // load tile map base
str r6,[r3,#BASE2_OFFSET0] // set tile map base
// set control word of lane 0: shift=FRACT+tilebits, mask=0..mapwbits-1
ldr r6,RenderTilePersp_Ctrl // load control word (ADD_RAW + shift=FRACT)
ldrb r1,[r4,#SSEGM_PAR3] // get number of bits of tile width and height (tilebits) -> R1
str r1,[sp,#0] // save tilebits -> [SP+0] (overwrites the saved R3 slot)
adds r6,r1 // FRACT + tilebits (SIO_INTERP0_CTRL_LANE0_SHIFT_LSB = 0, no shift required)
ldrb r2,[r4,#SSEGM_WB] // number of bits of tile map width mapwbits -> R2
subs r5,r2,#1 // mapwbits - 1
lsls r5,#SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB // shift to mask MSB position
orrs r6,r5 // add to control word
str r6,[r3,#CTRL_LANE0_OFFSET0] // set control word of lane 0
// set control word of lane 1: shift=FRACT+tilebits-mapwbits,
// mask=mapwbits..mapwbits+maphbits-1
// (R6 still holds the lane 0 word, so its fields are adjusted in place)
subs r6,r2 // FRACT + tilebits - mapwbits
lsls r2,#SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB // shift mapwbits to mask LSB position
orrs r6,r2 // add mapwbits to control word
ldrb r2,[r4,#SSEGM_WB+1] // number of bits of tile map height maphbits -> R2
lsls r2,#SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB // shift maphbits to mask MSB position
adds r6,r2 // arithmetic add: mask MSB = (mapwbits-1) + maphbits
str r6,[r3,#CTRL_LANE1_OFFSET0] // set control word of lane 1
// ---- setup interpolator 1 to get pixel index
// set tile image to base2
ldr r6,[r4,#SSEGM_PAR] // load tile image base
str r6,[r3,#BASE2_OFFSET1] // set tile image base
// set control word of lane 0: shift=FRACT, mask=0..tilebits-1
ldr r6,RenderTilePersp_Ctrl // load control word (ADD_RAW + shift=FRACT)
subs r5,r1,#1 // tilebits - 1
lsls r5,#SIO_INTERP1_CTRL_LANE0_MASK_MSB_LSB // shift to mask MSB position
orrs r6,r5 // add to control word
str r6,[r3,#CTRL_LANE0_OFFSET1] // set control word of lane 0
// set control word of lane 1: shift=FRACT-tilebits, mask=tilebits..tilebits*2-1
// (again derived in place from the lane 0 word in R6)
subs r6,r1 // FRACT - tilebits
lsls r5,r1,#SIO_INTERP1_CTRL_LANE0_MASK_LSB_LSB // shift to mask LSB position
orrs r6,r5 // add tilebits to control word
lsls r1,#SIO_INTERP1_CTRL_LANE0_MASK_MSB_LSB // shift tilebits to mask MSB position
adds r6,r1 // arithmetic add: mask MSB = (tilebits-1) + tilebits
str r6,[r3,#CTRL_LANE1_OFFSET1] // set control word of lane 1
// R0 ... pointer to data buffer
// R3 ... interpolator base
// R4 ... video segment
// R7 ... width/4
// LR ... start coordinate X0
// R12 ... current coordinate Y0
// [SP+0] ... number of bits of tile width and height
// ---- set matrix
// get pointer to matrix -> R4
ldr r4,[r4,#SSEGM_PAR2] // get pointer to matrix -> R4
// get distance coefficient dist -> R1
// NOTE(review): the divider operands were written before the interpolator
// setup above; presumably that code also serves as the wait for the divider
// result (8 cycles per the RP2040 datasheet) - do not move this read earlier.
ldr r1,RenderTilePersp_pSioBase // get address of SIO base -> R1
ldr r1,[r1,#SIO_DIV_QUOTIENT_OFFSET] // get quotient-> R1, distance coefficient
// r4+0 ... m11
// r4+4 ... m12
// r4+8 ... m13
// r4+12 ... m21
// r4+16 ... m22
// r4+20 ... m23
// set m11 -> R5 base0
ldr r5,[r4,#0] // load m11
muls r5,r1 // m11*dist
asrs r5,#FRACT // (m11*dist)>>FRACT ... delta
lsls r2,r5,#1 // delta*2
adds r2,r5 // delta*3
str r2,[r3,#BASE0_OFFSET0] // set base0 of interpolator 0
str r2,[r3,#BASE0_OFFSET1] // set base0 of interpolator 1
// set m21 -> R6 base1
ldr r6,[r4,#12] // load m21
muls r6,r1 // m21*dist
asrs r6,#FRACT // (m21*dist)>>FRACT ... delta
lsls r2,r6,#1 // delta*2
adds r2,r6 // delta*3
str r2,[r3,#BASE1_OFFSET0] // set base1 of interpolator 0
str r2,[r3,#BASE1_OFFSET1] // set base1 of interpolator 1
// R0 ... pointer to data buffer
// R1 ... distance coefficient
// R3 ... interpolator base
// R4 ... pointer to matrix
// R5 ... (m11*dist)>>FRACT, scaled m11
// R6 ... (m21*dist)>>FRACT, scaled m21
// R7 ... width/4
// LR ... start coordinate X0
// R12 ... current coordinate Y0
// [SP+0] ... number of bits of tile width and height
// set x0*m11 + y0*m12 + m13 -> accum0 (m11, m12 scaled by dist; m13 unscaled)
mov r2,lr // start coordinate X0 -> R2
muls r5,r2 // x0*m11 -> R5
muls r2,r6 // x0*m21 -> R2
mov lr,r1 // save distance coefficient -> LR
ldr r6,[r4,#4] // load m12 -> R6
muls r1,r6 // m12*dist -> R1
asrs r1,#FRACT // (m12*dist)>>FRACT -> R1
mov r6,r12 // load coordinate Y0 -> R6
muls r1,r6 // y0*m12 -> R1
adds r5,r1 // x0*m11 + y0*m12 -> R5
ldr r1,[r4,#8] // load m13 -> R1
adds r5,r1 // x0*m11 + y0*m12 + m13 -> R5
str r5,[r3,#ACCUM0_OFFSET0] // set accum0 of interpolator 0
str r5,[r3,#ACCUM0_OFFSET1] // set accum0 of interpolator 1
// R0 ... pointer to data buffer
// R2 ... x0*m21
// R3 ... interpolator base
// R4 ... pointer to matrix
// R6 ... current coordinate Y0
// R7 ... width/4
// LR ... distance coefficient
// [SP+0] ... number of bits of tile width and height
// set x0*m21 + y0*m22 + m23 -> accum1 (m21, m22 scaled by dist; m23 unscaled)
ldr r1,[r4,#16] // load m22 -> R1
mov r5,lr // distance coefficient -> R5
muls r1,r5 // m22*dist
asrs r1,#FRACT // (m22*dist)>>FRACT -> R1
muls r1,r6 // y0*m22 -> R1
adds r2,r1 // x0*m21 + y0*m22 -> R2
ldr r1,[r4,#20] // load m23 -> R1
adds r2,r1 // x0*m21 + y0*m22 + m23 -> R2
str r2,[r3,#ACCUM1_OFFSET0] // set accum1 of interpolator 0
str r2,[r3,#ACCUM1_OFFSET1] // set accum1 of interpolator 1
// ---- process odd 4-pixel
// prepare tile bits * 2 (shift count that multiplies a tile index
// by the number of bytes of one tile image, width*height)
ldr r6,[sp,#0] // get tile bits
lsls r6,#1 // tile bits * 2
// R0 ... pointer to destination data buffer
// R1 ... (temporary - pixel accumulator 1)
// R2 ... (temporary - pixel accumulator 2)
// R3 ... interpolator base
// R4 ... (temporary - get pointer to tile map, load tile index)
// R5 ... (temporary - get pointer to pixel, load pixel)
// R6 ... tilebits*2
// R7 ... width/4 (loop counter)
// [SP+0] ... number of bits of tile width and height
// check odd 4-pixels
lsrs r7,#1 // width/4/2 -> R7, carry = odd 4-pixel group present
bcc 2f // no odd 4-pixel
// load pixel: one sample is replicated into all 4 bytes of the word
ldr r4,[r3,#POP_FULL_OFFSET0] // [1] get pointer to tile map
ldrb r4,[r4,#0] // [2] load tile index
lsls r4,r6 // [1] tile index * tile size
ldr r5,[r3,#POP_FULL_OFFSET1] // [1] get pointer to tile image
ldrb r1,[r5,r4] // [2] load pixel
lsls r4,r1,#8 // [1] shift 1 byte left
orrs r1,r4 // [1] add pixel to accumulator
lsls r4,r1,#16 // [1] shift 2 bytes left
orrs r1,r4 // [1] add pixel to accumulator
// store 4 pixels
stmia r0!,{r1} // [2] store 4 pixels
// check number of remaining pixels
2: tst r7,r7 // check number of pixels
beq 8f // end
// ---- [37 per 8 pixels] inner loop
// Each pass takes 3 samples and writes 8 bytes: the 1st sample fills
// bytes 0-1 of R1, the 2nd fills bytes 2-3 of R1, the 3rd fills all of R2.
// NOTE(review): every POP_FULL read steps the accumulators by delta*3, yet
// the samples are replicated 2+2+4 times here (and 4 times in the odd
// group above) - confirm this is the intended approximation of "triple pixels".
// R0 ... pointer to destination data buffer
// R1 ... (temporary - pixel accumulator 1)
// R2 ... (temporary - pixel accumulator 2)
// R3 ... interpolator base
// R4 ... (temporary - get pointer to tile map, load tile index)
// R5 ... (temporary - get pointer to pixel, load pixel)
// R6 ... tilebits*2
// R7 ... width/8 (loop counter)
// [9] load 1st pixel
6: ldr r4,[r3,#POP_FULL_OFFSET0] // [1] get pointer to tile map
ldrb r4,[r4,#0] // [2] load tile index
lsls r4,r6 // [1] tile index * tile size
ldr r5,[r3,#POP_FULL_OFFSET1] // [1] get pointer to tile image
ldrb r1,[r5,r4] // [2] load pixel
lsls r4,r1,#8 // [1] shift 1 byte left
orrs r1,r4 // [1] add pixel to accumulator
// [11] load 2nd pixel
ldr r4,[r3,#POP_FULL_OFFSET0] // [1] get pointer to tile map
ldrb r4,[r4,#0] // [2] load tile index
lsls r4,r6 // [1] tile index * tile size
ldr r5,[r3,#POP_FULL_OFFSET1] // [1] get pointer to tile image
ldrb r4,[r5,r4] // [2] load pixel
lsls r4,#16 // [1] shift 2 bytes left
orrs r1,r4 // [1] add pixel to accumulator
lsls r4,#8 // [1] shift 1 byte left
orrs r1,r4 // [1] add pixel to accumulator
// [11] load 3rd pixel
ldr r4,[r3,#POP_FULL_OFFSET0] // [1] get pointer to tile map
ldrb r4,[r4,#0] // [2] load tile index
lsls r4,r6 // [1] tile index * tile size
ldr r5,[r3,#POP_FULL_OFFSET1] // [1] get pointer to tile image
ldrb r2,[r5,r4] // [2] load pixel
lsls r4,r2,#8 // [1] shift 1 byte left
orrs r2,r4 // [1] add pixel to accumulator
lsls r4,r2,#16 // [1] shift 2 bytes left
orrs r2,r4 // [1] add pixel to accumulator
// [3] store 8 pixels
stmia r0!,{r1,r2} // [3] store 8 pixels
// [2,3] loop counter
subs r7,#1 // [1] 8-pixel counter
bne 6b // [1,2] next 8-pixels
// pop registers and return (R0 = new pointer to data buffer;
// R3 receives the tile bits left in the [SP+0] slot, which is harmless)
8: pop {r3-r7,pc}
// ---- constants loaded PC-relative by RenderTilePersp3
.p2align 2 // align the words below to a 4-byte boundary

// address of SIO base (used to access the hardware divider)
RenderTilePersp_pSioBase: .word SIO_BASE

// address of interpolator 0 base (its ACCUM0 register)
RenderTilePersp_Interp: .word SIO_BASE+SIO_INTERP0_ACCUM0_OFFSET

// lane control word template: ADD_RAW set, shift = FRACT
RenderTilePersp_Ctrl: .word SIO_INTERP0_CTRL_LANE0_ADD_RAW_BITS | (FRACT<<SIO_INTERP0_CTRL_LANE0_SHIFT_LSB)