;* Here are the texture-mapping inner loops in pure assembly ;* language. ;* ;* Since discovering the Win32 Demos FAQ, self-modifying code ;* no longer sits in the data segment. BITS 32 ; Segment/section definition macros. %ifdef M_TARGET_WATCOM SEGMENT DATA PUBLIC ALIGN=16 CLASS=DATA USE32 SEGMENT DATA %else SECTION .data %endif %define SPACEFILLER4 (0x44444444) ; If you change this in r_draw.c, be sure to change it here, too! FUZZTABLE equ 64 %ifdef M_TARGET_LINUX EXTERN columnofs EXTERN ylookup EXTERN centery EXTERN fuzzpos EXTERN fuzzoffset EXTERN DefaultPalette EXTERN realviewheight EXTERN dc_pitch EXTERN dc_colormap EXTERN dc_iscale EXTERN dc_texturefrac EXTERN dc_source EXTERN dc_yl EXTERN dc_yh EXTERN dc_x EXTERN dc_mask EXTERN dc_ctspan EXTERN dc_temp EXTERN ds_colsize EXTERN ds_xstep EXTERN ds_ystep EXTERN ds_colormap EXTERN ds_source EXTERN ds_x1 EXTERN ds_x2 EXTERN ds_xfrac EXTERN ds_yfrac EXTERN ds_y GLOBAL ds_cursource GLOBAL ds_curcolormap %else EXTERN _columnofs EXTERN _ylookup EXTERN _centery EXTERN _fuzzpos EXTERN _fuzzoffset EXTERN _DefaultPalette EXTERN _realviewheight EXTERN _dc_pitch EXTERN _dc_colormap EXTERN _dc_iscale EXTERN _dc_texturefrac EXTERN _dc_source EXTERN _dc_yl EXTERN _dc_yh EXTERN _dc_x EXTERN _dc_mask EXTERN _dc_ctspan EXTERN _dc_temp EXTERN _ds_colsize EXTERN _ds_xstep EXTERN _ds_ystep EXTERN _ds_colormap EXTERN _ds_source EXTERN _ds_x1 EXTERN _ds_x2 EXTERN _ds_xfrac EXTERN _ds_yfrac EXTERN _ds_y GLOBAL _ds_cursource GLOBAL _ds_curcolormap %define columnofs _columnofs %define ylookup _ylookup %define centery _centery %define fuzzpos _fuzzpos %define fuzzoffset _fuzzoffset %define DefaultPalette _DefaultPalette %define realviewheight _realviewheight %define dc_pitch _dc_pitch %define dc_colormap _dc_colormap %define dc_iscale _dc_iscale %define dc_texturefrac _dc_texturefrac %define dc_source _dc_source %define dc_yl _dc_yl %define dc_yh _dc_yh %define dc_x _dc_x %define dc_mask _dc_mask %define dc_ctspan _dc_ctspan %define dc_temp _dc_temp %define ds_colsize _ds_colsize %define ds_xstep _ds_xstep %define ds_ystep _ds_ystep %define ds_colormap _ds_colormap %define ds_source _ds_source %define ds_x1 _ds_x1 %define ds_x2 _ds_x2 %define ds_xfrac _ds_xfrac %define ds_yfrac _ds_yfrac %define ds_y _ds_y %endif EXTERN PatchUnrolled _ds_cursource: ds_cursource: DD 0 _ds_curcolormap: ds_curcolormap: DD 0 ; Local stuff: lastAddress DD 0 pixelcount DD 0 %ifdef M_TARGET_WATCOM SEGMENT DATA PUBLIC ALIGN=16 CLASS=CODE USE32 SEGMENT DATA %else SECTION .data %endif GLOBAL ASM_PatchColSize GLOBAL _ASM_PatchColSize GLOBAL @ASM_PatchColSize@0 ASM_PatchColSize: _ASM_PatchColSize: @ASM_PatchColSize@0: mov ecx,[ds_colsize] mov edx,[ds_colsize] neg ecx mov [spadva+2],edx mov [spstb+2],dl add edx,edx mov [spadvb+2],edx add edx,edx mov [spsta+2],cl mov [spstd+2],cl add cl,cl mov [spadvc+2],edx mov [spstc+2],cl ret GLOBAL @R_SetSpanSource_ASM@4 GLOBAL _R_SetSpanSource_ASM GLOBAL R_SetSpanSource_ASM R_SetSpanSource_ASM: _R_SetSpanSource_ASM: mov ecx,[esp+4] @R_SetSpanSource_ASM@4: mov [spreada+2],ecx mov [spreadb+2],ecx mov [spreadc+2],ecx mov [spreadd+2],ecx mov [spreade+2],ecx mov [spreadf+2],ecx mov [spreadg+2],ecx mov [ds_cursource],ecx ret GLOBAL @R_SetSpanColormap_ASM@4 GLOBAL _R_SetSpanColormap_ASM GLOBAL R_SetSpanColormap_ASM R_SetSpanColormap_ASM: _R_SetSpanColormap_ASM: mov ecx,[esp+4] @R_SetSpanColormap_ASM@4: mov [spmapa+2],ecx mov [spmapb+2],ecx mov [spmapc+2],ecx mov [spmapd+2],ecx mov [spmape+2],ecx mov [spmapf+2],ecx mov [spmapg+2],ecx mov [ds_curcolormap],ecx aret: ret GLOBAL @R_DrawSpanP_ASM@0 GLOBAL _R_DrawSpanP_ASM GLOBAL R_DrawSpanP_ASM ; eax: scratch ; ebx: zero ; ecx: xfrac at top end, yfrac int part in low end ; edx: yfrac frac part at top end ; edi: dest ; ebp: scratch ; esi: count align 16 @R_DrawSpanP_ASM@0: _R_DrawSpanP_ASM: R_DrawSpanP_ASM: mov eax,[ds_x2] mov ecx,[ds_x1] sub eax,ecx mov edx,[columnofs] jl aret ; count < 0: nothing to do, so leave push ebx push edi push ebp push esi mov edi,[edx+ecx*4] mov edx,[ylookup] mov ecx,[ds_y] add edi,[edx+ecx*4] mov edx,[ds_ystep] xor ebx,ebx shl edx,6 mov ecx,[ds_ystep] shr ecx,26 lea esi,[eax+1] or ecx,[ds_xstep] mov [ds_ystep],edx mov [ds_xstep],ecx mov ecx,[ds_yfrac] shr ecx,26 mov edx,[ds_yfrac] shl edx,6 or ecx,[ds_xfrac] shr esi,1 jnc dseven1 ; do odd pixel mov ebp,ecx rol ebp,6 and ebp,0xfff add edx,[ds_ystep] adc ecx,[ds_xstep] spreada mov bl,[ebp+SPACEFILLER4] spmapa mov bl,[ebx+SPACEFILLER4] mov [edi],bl spadva add edi,1 dseven1 shr esi,1 jnc dsrest ; do two more pixels mov ebp,ecx add edx,[ds_ystep] adc ecx,[ds_xstep] and ebp,0xfc00003f rol ebp,6 mov eax,ecx add edx,[ds_ystep] adc ecx,[ds_xstep] spreadb mov bl,[ebp+SPACEFILLER4] ;read texel1 rol eax,6 and eax,0xfff spmapb mov bl,[ebx+SPACEFILLER4] ;map texel1 mov [edi],bl ;store texel1 spadvb add edi,2 spreadc mov bl,[eax+SPACEFILLER4] ;read texel2 spmapc mov bl,[ebx+SPACEFILLER4] ;map texel2 spsta mov [edi-1],bl ;store texel2 ; do the rest dsrest test esi,esi jz near dsdone align 16 dsloop mov ebp,ecx spstep1d add edx,[ds_ystep] spstep2d adc ecx,[ds_xstep] and ebp,0xfc00003f rol ebp,6 mov eax,ecx spstep1e add edx,[ds_ystep] spstep2e adc ecx,[ds_xstep] spreadd mov bl,[ebp+SPACEFILLER4] ;read texel1 rol eax,6 and eax,0xfff spmapd mov bl,[ebx+SPACEFILLER4] ;map texel1 mov [edi],bl ;store texel1 mov ebp,ecx spreade mov bl,[eax+SPACEFILLER4] ;read texel2 spstep1f add edx,[ds_ystep] spstep2f adc ecx,[ds_xstep] and ebp,0xfc00003f rol ebp,6 spmape mov bl,[ebx+SPACEFILLER4] ;map texel2 mov eax,ecx spstb mov [edi+1],bl ;store texel2 spreadf mov bl,[ebp+SPACEFILLER4] ;read texel3 spmapf mov bl,[ebx+SPACEFILLER4] ;map texel3 spadvc add edi,4 rol eax,6 and eax,0xfff spstc mov [edi-2],bl ;store texel3 spreadg mov bl,[eax+SPACEFILLER4] ;read texel4 spstep1g add edx,[ds_ystep] spstep2g adc ecx,[ds_xstep] spmapg mov bl,[ebx+SPACEFILLER4] ;map texel4 dec esi spstd mov [edi-1],bl ;store texel4 jnz near dsloop dsdone pop esi pop ebp pop edi pop ebx rdspret ret ;************************ GLOBAL @ASM_PatchPitch@0 GLOBAL _ASM_PatchPitch GLOBAL ASM_PatchPitch ASM_PatchPitch: _ASM_PatchPitch: @ASM_PatchPitch@0: mov edx,[dc_pitch] ; 1 * dc_pitch mov [rdcp1+2],edx mov [f1a+3],edx mov [f1b+2],edx ; 2 * dc_pitch add edx,[dc_pitch] mov [f2a+3],edx mov [f2b+2],edx ; 3 * dc_pitch add edx,[dc_pitch] mov [f3a+3],edx mov [f3b+2],edx ; 4 * dc_pitch add edx,[dc_pitch] mov [f4+2],edx jmp PatchUnrolled ;*---------------------------------------------------------------------- ;* ;* R_DrawColumnP ;* ;*---------------------------------------------------------------------- GLOBAL @R_DrawColumnP_ASM@0 GLOBAL _R_DrawColumnP_ASM GLOBAL R_DrawColumnP_ASM align 16 R_DrawColumnP_ASM: _R_DrawColumnP_ASM: @R_DrawColumnP_ASM@0: ; count = dc_yh - dc_yl; mov eax,[dc_yh] mov ecx,[dc_yl] sub eax,ecx mov edx,[ylookup] jl near rdcpret ; count < 0: nothing to do, so leave push ebp ; save registers push ebx push edi push esi ; dest = ylookup[dc_yl] + columnofs[dc_x]; mov edi,[edx+ecx*4] mov edx,[columnofs] mov ebx,[dc_x] inc eax ; make 0 count mean 0 pixels imul eax,[dc_pitch] ; Start turning the counter into an index add edi,[edx+ebx*4] ; edi = top of destination column add edi,eax ; Point edi to the bottom of the destination column mov edx,[dc_iscale] shr edx,16 mov ecx,[dc_texturefrac] ; ecx = frac (all 32 bits) mov ebx,ecx mov ebp,0 shl ebx,16 sub ebp,eax ; ebp = counter (counts up to 0 in pitch increments) shr ecx,16 mov eax,[dc_mask] and ecx,eax or ebx,edx ; Put fracstep integral part into bl shl eax,8 mov edx,[dc_iscale] shl edx,16 mov esi,[dc_source] or ebx,eax ; Put mask byte into bh mov eax,[dc_colormap] sub edi,[dc_pitch] ; The registers should now look like this: ; ; [31 .. 16][15 .. 8][7 .. 0] ; eax [colormap ][texel ] ; ebx [yf ][mask ][dyi ] ; ecx [0 ][yi ] ; edx [dyf ][ ][ ] ; esi [source texture column ] ; edi [destination screen pointer ] ; ebp [counter (adds up) ] ; ; Unfortunately, this arrangement is going to produce ; lots of partial register stalls on anything better ; than a Pentium. align 16 rdcploop: mov al,[esi+ecx] ; Fetch texel add ebx,edx ; increment frac fractional part adc cl,bl ; increment frac integral part rdcp1: add ebp,SPACEFILLER4 ; increment counter mov al,[eax] ; colormap texel and cl,bh mov [edi+ebp],al ; Store texel test ebp,ebp jnz rdcploop ; loop pop esi pop edi pop ebx pop ebp rdcpret: ret ;*---------------------------------------------------------------------- ;* ;* R_DrawFuzzColumnP ;* ;* This code assumes that the fuzztable is some 2^n size, which it is ;* not in the original DOOM. ;* ;*---------------------------------------------------------------------- GLOBAL @R_DrawFuzzColumnP_ASM@0 GLOBAL _R_DrawFuzzColumnP_ASM GLOBAL R_DrawFuzzColumnP_ASM align 16 R_DrawFuzzColumnP_ASM: _R_DrawFuzzColumnP_ASM: @R_DrawFuzzColumnP_ASM@0: ; Adjust borders. Low... mov eax,[dc_yl] push ebx push esi push edi cmp eax,0 jg .ylok mov eax,1 ; ...and high. .ylok mov edx,[realviewheight] mov esi,[dc_yh] lea ecx,[edx-1] cmp esi,ecx jl .yhok lea esi,[edx-2] .yhok sub esi,eax ; esi = count js near dfcdone ; Zero length (or less) mov ecx,[ylookup] mov edx,[dc_x] ; AGI stall mov edi,[ecx+eax*4] mov ecx,[columnofs] ; AGI stall mov ebx,[ecx+edx*4] mov eax,[DefaultPalette] mov ecx,[fuzzpos] add edi,ebx inc esi mov eax,[eax+8] mov ebx,esi add eax,6*256 shr esi,2 shl ebx,8 test bh,3 jz fquadloop ; ; esi = count ; edi = dest ; ecx = fuzzpos ; eax = colormap 6 (256-byte aligned) ; ; do odd pixel (if any) test bh,1 jz .oddid mov edx,[fuzzoffset+ecx*4] inc ecx mov al,[edi+edx] and ecx,FUZZTABLE-1 mov bl,[eax] mov [edi],bl add edi,[dc_pitch] ; do two non-dword aligned pixels (if any) .oddid test bh,2 jz .undid mov edx,[fuzzoffset+ecx*4] inc ecx mov al,[edi+edx] and ecx,FUZZTABLE-1 mov bl,[eax] mov edx,[fuzzoffset+ecx*4] mov [edi],bl add edi,[dc_pitch] inc ecx mov al,[edi+edx] and ecx,FUZZTABLE-1 mov bl,[eax] mov [edi],bl add edi,[dc_pitch] ; make sure we still have some pixels left to do .undid test esi,esi jz savefuzzpos fquadloop: mov edx,[fuzzoffset+ecx*4] inc ecx mov al,[edi+edx] ; AGI stall and ecx,FUZZTABLE-1 mov bl,[eax] ; AGI stall mov edx,[fuzzoffset+ecx*4] mov [edi],bl inc ecx f1a: mov al,[edi+edx+SPACEFILLER4] and ecx,FUZZTABLE-1 mov bl,[eax] mov edx,[fuzzoffset+ecx*4] f1b: mov [edi+SPACEFILLER4],bl inc ecx f2a: mov al,[edi+edx+2*SPACEFILLER4] and ecx,FUZZTABLE-1 mov bl,[eax] mov edx,[fuzzoffset+ecx*4] f2b: mov [edi+2*SPACEFILLER4],bl inc ecx f3a: mov al,[edi+edx+3*SPACEFILLER4] and ecx,FUZZTABLE-1 mov bl,[eax] dec esi f3b: mov [edi+3*SPACEFILLER4],bl f4: lea edi,[edi+4*SPACEFILLER4] jnz fquadloop savefuzzpos: add ecx,3 and ecx,FUZZTABLE-1 mov [fuzzpos],ecx dfcdone: pop edi pop esi pop ebx ret ;*---------------------------------------------------------------------- ;* ;* R_DrawColumnHorizP_ASM ;* ;*---------------------------------------------------------------------- GLOBAL @R_DrawColumnHorizP_ASM@0 GLOBAL _R_DrawColumnHorizP_ASM GLOBAL R_DrawColumnHorizP_ASM align 16 @R_DrawColumnHorizP_ASM@0: R_DrawColumnHorizP_ASM: _R_DrawColumnHorizP_ASM: ; count = dc_yh - dc_yl; mov eax,[dc_yh] mov ecx,[dc_yl] sub eax,ecx mov edx,[dc_x] jl near .leave ; count < 0: nothing to do, so leave push ebp ; save registers push ebx push edi push esi ; dest = ylookup[dc_yl] + columnofs[dc_x]; inc eax ; make 0 count mean 0 pixels and edx,3 push eax mov esi,[dc_ctspan+edx*4] lea eax,[dc_temp+ecx*4+edx] ; eax = top of column in buffer mov ebp,[dc_yh] mov [esi],ecx mov [esi+4],ebp add esi,8 mov [dc_ctspan+edx*4],esi mov esi,[dc_iscale] mov ecx,[dc_texturefrac] ; ecx = frac (all 32 bits) shl ecx,8 mov ebx,[dc_mask] shl esi,8 mov edi,[dc_source] or ecx,ebx mov dl,[edi] ; load cache mov ebx,[esp] and ebx,0xfffffff8 jnz .mthan8 ; Register usage in the following code is: ; ; eax: dest ; edi: source ; ecx: frac (8.16)/mask (..8) ; esi: fracstep (8.24) ; ebx: add1 ; ebp: add2 ; dl: texel1 ; dh: texel2 ;[esp] count ; there are fewer than 8 pixels to draw mov ebx,[esp] .lthan8 shr ebx,1 jnc .even ; do one pixel before loop (little opportunity for pairing) mov ebp,ecx ; copy frac to ebx add ecx,esi ; increment frac shr ebp,24 ; shift frac over to low byte add eax,4 and ebp,ecx ; mask it mov dl,[edi+ebp] mov [eax-4],dl .even test ebx,ebx jz near .done .loop2 mov [esp],ebx ; save counter mov ebx,ecx ; copy frac for texel1 to ebx shr ebx,24 ; shift frac for texel1 to low byte add ecx,esi ; increment frac mov ebp,ecx ; copy frac for texel2 to ebp and ebx,ecx ; mask frac for texel1 shr ebp,24 ; shift frac for texel2 to low byte add ecx,esi ; increment frac and ebp,ecx ; mask frac for texel2 mov dl,[edi+ebx] ; read texel1 mov ebx,[esp] ; fetch counter mov dh,[edi+ebp] ; read texel2 mov [eax],dl ; write texel1 mov [eax+4],dh ; write texel2 add eax,8 ; increment dest dec ebx ; decrement counter jnz .loop2 ; loop until it hits 0 jmp .done ; there are more than 8 pixels to draw. position eax as close to a 32 byte ; boundary as possible, then do whatever is left. .mthan8 test eax,4 jz .try2 mov ebp,ecx ; frac: in ebp add ecx,esi ; step shr ebp,24 ; frac: shift add eax,4 ; increment dest and ebp,ecx ; frac: mask mov ebx,[esp] ; fetch counter mov dl,[edi+ebp] ; tex: read dec ebx ; decrement counter mov [eax-4],dl ; tex: write mov [esp],ebx ; store counter .try2 test eax,8 jz .try4 mov ebx,ecx ; frac1: in ebx add ecx,esi ; step shr ebx,24 ; frac1: shift mov ebp,ecx ; frac2: in ebp shr ebp,24 ; frac2: shift and ebx,ecx ; frac1: mask and ebp,ecx ; frac2: mask add ecx,esi ; step mov dl,[edi+ebx] ; tex1: read mov ebx,[esp] ; fetch counter mov dh,[edi+ebp] ; tex2: read mov [eax],dl ; tex1: write mov [eax+4],dh ; tex2: write sub ebx,2 ; decrement counter add eax,8 ; increment dest mov [esp],ebx ; store counter .try4 test eax,16 jz .try8 mov ebx,ecx ; frac1: in ebx add ecx,esi ; step shr ebx,24 ; frac1: shift mov ebp,ecx ; frac2: in ebp shr ebp,24 ; frac2: shift and ebx,ecx ; frac1: mask and ebp,ecx ; frac2: mask add ecx,esi ; step mov dl,[edi+ebx] ; tex1: read mov ebx,ecx ; frac3: in ebx shr ebx,24 ; frac3: shift mov dh,[edi+ebp] ; tex2: read add ecx,esi ; step mov [eax],dl ; tex1: write mov [eax+4],dh ; tex2: write mov ebp,ecx ; frac4: in ebp shr ebp,24 ; frac4: shift and ebx,ecx ; frac3: mask and ebp,ecx ; frac4: mask add ecx,esi ; step mov dl,[edi+ebx] ; tex3: read mov ebx,[esp] ; fetch counter mov dh,[edi+ebp] ; tex4: read sub ebx,4 ; decrement counter mov [esp],ebx ; store counter mov [eax+8],dl ; tex3: write mov [eax+12],dh ; tex4: write add eax,16 ; increment dest .try8 mov ebx,[esp] ; make counter count groups of 8 sub esp,4 shr ebx,3 jmp .tail8 align 16 .loop8 mov [esp],ebx ; save counter mov ebx,ecx ; frac1: in ebx shr ebx,24 ; frac1: shift add ecx,esi ; step mov ebp,ecx ; frac2: in ebp and ebx,ecx ; frac1: mask shr ebp,24 ; frac2: shift add ecx,esi ; step and ebp,ecx ; frac2: mask mov dl,[edi+ebx] ; tex1: read mov ebx,ecx ; frac3: in ebx mov dh,[edi+ebp] ; tex2: read shr ebx,24 ; frac3: shift add ecx,esi ; step mov [eax],dl ; tex1: write mov [eax+4],dh ; tex2: write mov ebp,ecx ; frac4: in ebp and ebx,ecx ; frac3: mask shr ebp,24 ; frac4: shift add ecx,esi ; step and ebp,ecx ; frac4: mask mov dl,[edi+ebx] ; tex3: read mov ebx,ecx ; frac5: in ebx mov dh,[edi+ebp] ; tex4: read shr ebx,24 ; frac5: shift mov [eax+8],dl ; tex3: write mov [eax+12],dh ; tex4: write add ecx,esi ; step mov ebp,ecx ; frac6: in ebp and ebx,ecx ; frac5: mask shr ebp,24 ; frac6: shift mov dl,[edi+ebx] ; tex5: read and ebp,ecx ; frac6: mask add ecx,esi ; step mov ebx,ecx ; frac7: in ebx mov [eax+16],dl ; tex5: write shr ebx,24 ; frac7: shift mov dh,[edi+ebp] ; tex6: read and ebx,ecx ; frac7: mask add ecx,esi ; step mov ebp,ecx ; frac8: in ebp mov [eax+20],dh ; tex6: write shr ebp,24 ; frac8: shift add eax,32 ; increment dest pointer and ebp,ecx ; frac8: mask mov dl,[edi+ebx] ; tex7: read mov ebx,[esp] ; fetch counter mov [eax-8],dl ; tex7: write mov dh,[edi+ebp] ; tex8: read add ecx,esi ; step mov [eax-4],dh ; tex8: write mov dl,[eax] ; load cache dec ebx ; decrement counter .tail8 jnz near .loop8 ; loop if more to do pop ebp mov ebx,[esp] and ebx,7 jnz near .lthan8 .done pop eax pop esi pop edi pop ebx pop ebp .leave ret ;*---------------------------------------------------------------------- ;* ;* rt_copy1col_asm ;* ;* ecx = hx ;* edx = sx ;* [esp+4] = yl ;* [esp+8] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_copy1col_asm@16 GLOBAL _rt_copy1col_asm GLOBAL rt_copy1col_asm align 16 rt_copy1col_asm: _rt_copy1col_asm: pop eax mov edx,[esp+4*3] mov ecx,[esp+4*2] push edx push ecx mov ecx,[esp+4*2] mov edx,[esp+4*3] push eax @rt_copy1col_asm@16: mov eax, [esp+4] push ebx mov ebx, [esp+12] push esi sub ebx, eax push edi js .done lea esi,[eax*4] mov eax,[columnofs] inc ebx ; ebx = count mov eax,[eax+edx*4] mov edx,[ylookup] lea ecx,[dc_temp+ecx+esi] ; ecx = source mov edi,[edx+esi] mov esi,[dc_pitch] ; esi = pitch add eax,edi ; eax = dest shr ebx,1 jnc .even mov dl,[ecx] add ecx,4 mov [eax],dl add eax,esi .even and ebx,ebx jz .done .loop mov dl,[ecx] mov dh,[ecx+4] mov [eax],dl mov [eax+esi],dh add ecx,8 lea eax,[eax+esi*2] dec ebx jnz .loop .done pop edi pop esi pop ebx ret 8 ;*---------------------------------------------------------------------- ;* ;* rt_copy2cols_asm ;* ;* ecx = hx ;* edx = sx ;* [esp+4] = yl ;* [esp+8] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_copy2cols_asm@16 GLOBAL _rt_copy2cols_asm GLOBAL rt_copy2cols_asm align 16 _rt_copy2cols_asm: rt_copy2cols_asm: pop eax mov edx,[esp+4*3] mov ecx,[esp+4*2] push edx push ecx mov ecx,[esp+4*2] mov edx,[esp+4*3] push eax @rt_copy2cols_asm@16: mov eax, [esp+4] push ebx mov ebx, [esp+12] push esi sub ebx, eax push edi js .done lea esi,[eax*4] mov eax,[columnofs] inc ebx ; ebx = count mov eax,[eax+edx*4] mov edx,[ylookup] lea ecx,[dc_temp+ecx+esi] ; ecx = source mov edi,[edx+esi] mov edx,[dc_pitch] ; edx = pitch add eax,edi ; eax = dest shr ebx,1 jnc .even mov si,[ecx] add ecx,4 mov [eax],si add eax,edx .even and ebx,ebx jz .done .loop mov si,[ecx] mov di,[ecx+4] mov [eax],si mov [eax+edx],di add ecx,8 lea eax,[eax+edx*2] dec ebx jnz .loop .done pop edi pop esi pop ebx ret 8 ;*---------------------------------------------------------------------- ;* ;* rt_copy4cols_asm ;* ;* ecx = sx ;* edx = yl ;* [esp+4] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_copy4cols_asm@12 GLOBAL _rt_copy4cols_asm GLOBAL rt_copy4cols_asm align 16 rt_copy4cols_asm: _rt_copy4cols_asm: pop eax mov ecx,[esp+8] mov edx,[esp+4] push ecx mov ecx,[esp+4] push eax @rt_copy4cols_asm@12: push ebx mov ebx,[esp+8] push esi sub ebx,edx push edi js .done mov eax,[columnofs] inc ebx ; ebx = count mov eax,[eax+ecx*4] mov ecx,[ylookup] mov esi,[ecx+edx*4] lea ecx,[dc_temp+edx*4] ; ecx = source mov edx,[dc_pitch] ; edx = pitch add eax,esi ; eax = dest shr ebx,1 jnc .even mov esi,[ecx] add ecx,4 mov [eax],esi add eax,edx .even and ebx,ebx jz .done .loop mov esi,[ecx] mov edi,[ecx+4] mov [eax],esi mov [eax+edx],edi add ecx,8 lea eax,[eax+edx*2] dec ebx jnz .loop .done pop edi pop esi pop ebx ret 4 ;*---------------------------------------------------------------------- ;* ;* rt_map1col_asm ;* ;* ecx = hx ;* edx = sx ;* [esp+4] = yl ;* [esp+8] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_map1col_asm@16 GLOBAL _rt_map1col_asm GLOBAL rt_map1col_asm align 16 rt_map1col_asm: _rt_map1col_asm: pop eax mov edx,[esp+4*3] mov ecx,[esp+4*2] push edx push ecx mov ecx,[esp+4*2] mov edx,[esp+4*3] push eax @rt_map1col_asm@16: mov eax,[esp+4] push ebx mov ebx,[esp+12] push ebp push esi sub ebx, eax push edi js .done lea edi,[eax*4] mov eax,[columnofs] mov esi,[dc_colormap] ; esi = colormap inc ebx ; ebx = count mov eax,[eax+edx*4] mov edx,[ylookup] lea ebp,[dc_temp+ecx+edi] ; ebp = source mov ecx,[edx+edi] mov edi,[dc_pitch] ; edi = pitch add eax,ecx ; eax = dest xor ecx,ecx xor edx,edx shr ebx,1 jnc .even mov dl,[ebp] add ebp,4 mov dl,[esi+edx] mov [eax],dl add eax,edi .even and ebx,ebx jz .done .loop mov dl,[ebp] mov cl,[ebp+4] add ebp,8 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov [eax+edi],cl dec ebx lea eax,[eax+edi*2] jnz .loop .done pop edi pop esi pop ebp pop ebx ret 8 ;*---------------------------------------------------------------------- ;* ;* rt_map2cols_asm ;* ;* ecx = hx ;* edx = sx ;* [esp+4] = yl ;* [esp+8] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_map2cols_asm@16 GLOBAL _rt_map2cols_asm GLOBAL rt_map2cols_asm align 16 rt_map2cols_asm: _rt_map2cols_asm: pop eax mov edx,[esp+4*3] mov ecx,[esp+4*2] push edx push ecx mov ecx,[esp+4*2] mov edx,[esp+4*3] push eax @rt_map2cols_asm@16: mov eax,[esp+4] push ebx mov ebx,[esp+12] push ebp push esi sub ebx, eax push edi js near .done lea edi,[eax*4] mov eax,[columnofs] mov esi,[dc_colormap] ; esi = colormap inc ebx ; ebx = count mov eax,[eax+edx*4] mov edx,[ylookup] lea ebp,[dc_temp+ecx+edi] ; ebp = source mov ecx,[edx+edi] mov edi,[dc_pitch] ; edi = pitch add eax,ecx ; eax = dest xor ecx,ecx xor edx,edx shr ebx,1 jnc .even mov dl,[ebp] mov cl,[ebp+1] add ebp,4 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov [eax+1],cl add eax,edi .even and ebx,ebx jz .done .loop mov dl,[ebp] mov cl,[ebp+1] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov dl,[ebp+4] mov [eax+1],cl mov cl,[ebp+5] add ebp,8 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+edi],dl mov [eax+edi+1],cl dec ebx lea eax,[eax+edi*2] jnz .loop .done pop edi pop esi pop ebp pop ebx ret 8 ;*---------------------------------------------------------------------- ;* ;* rt_map4cols_asm ;* ;* ecx = sx ;* edx = yl ;* [esp+4] = yh ;* ;*---------------------------------------------------------------------- GLOBAL @rt_map4cols_asm1@12 GLOBAL _rt_map4cols_asm1 GLOBAL rt_map4cols_asm1 align 16 rt_map4cols_asm1: _rt_map4cols_asm1: pop eax mov ecx,[esp+8] mov edx,[esp+4] push ecx mov ecx,[esp+4] push eax @rt_map4cols_asm1@12: push ebx mov ebx,[esp+8] push ebp push esi sub ebx,edx push edi js near .done mov eax,[columnofs] mov esi,[dc_colormap] ; esi = colormap shl edx,2 mov eax,[eax+ecx*4] mov ecx,[ylookup] inc ebx ; ebx = count mov edi,[ecx+edx] lea ebp,[dc_temp+edx] ; ebp = source add eax,edi ; eax = dest mov edi,[dc_pitch] ; edi = pitch xor ecx,ecx xor edx,edx shr ebx,1 jnc .even mov dl,[ebp] mov cl,[ebp+1] add ebp,4 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov [eax+1],cl mov dl,[ebp-1] mov cl,[ebp-2] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+2],dl mov [eax+3],cl add eax,edi .even and ebx,ebx jz .done .loop mov dl,[ebp] mov cl,[ebp+1] add ebp,8 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov [eax+1],cl mov dl,[ebp-6] mov cl,[ebp-5] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+2],dl mov [eax+3],cl mov dl,[ebp-4] mov cl,[ebp-3] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+edi],dl mov [eax+edi+1],cl mov dl,[ebp-2] mov cl,[ebp-1] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+edi+2],dl mov [eax+edi+3],cl lea eax,[eax+edi*2] dec ebx jnz .loop .done pop edi pop esi pop ebp pop ebx ret 4 GLOBAL @rt_map4cols_asm2@12 GLOBAL _rt_map4cols_asm2 GLOBAL rt_map4cols_asm2 align 16 rt_map4cols_asm2: _rt_map4cols_asm2: pop eax mov ecx,[esp+8] mov edx,[esp+4] push ecx mov ecx,[esp+4] push eax @rt_map4cols_asm2@12: push ebx mov ebx,[esp+8] push ebp push esi sub ebx,edx push edi js near .done mov eax,[columnofs] mov esi,[dc_colormap] ; esi = colormap shl edx,2 mov eax,[eax+ecx*4] mov ecx,[ylookup] inc ebx ; ebx = count mov edi,[ecx+edx] lea ebp,[dc_temp+edx] ; ebp = source add eax,edi ; eax = dest mov edi,[dc_pitch] ; edi = pitch xor ecx,ecx xor edx,edx shr ebx,1 jnc .even mov dl,[ebp] mov cl,[ebp+1] add ebp,4 mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax],dl mov [eax+1],cl mov dl,[ebp-1] mov cl,[ebp-2] mov dl,[esi+edx] mov cl,[esi+ecx] mov [eax+2],dl mov [eax+3],cl add eax,edi .even and ebx,ebx jz .done .loop mov dl,[ebp+3] mov ch,[esi+edx] mov dl,[ebp+2] mov cl,[esi+edx] shl ecx,16 mov dl,[ebp+1] mov ch,[esi+edx] mov dl,[ebp] mov cl,[esi+edx] mov [eax],ecx add eax,edi mov dl,[ebp+7] mov ch,[esi+edx] mov dl,[ebp+6] mov cl,[esi+edx] shl ecx,16 mov dl,[ebp+5] mov ch,[esi+edx] mov dl,[ebp+4] mov cl,[esi+edx] mov [eax],ecx add eax,edi add ebp,8 dec ebx jnz .loop .done pop edi pop esi pop ebp pop ebx ret 4