;XTreem.asm - by Sean Palmer
; with a little help from:
;   Matt Pritchard, Bas Van Gaalen, Michael Abrash, Keld Hansen, Bresenham
; public domain
; credit me if you use any of this

ideal          ;needs TASM to assemble it.  I loathe MASM...
model tpascal  ;my language of choice.  Change at your own risk!
p386n          ;needs a 386 to run. Optimized for a 486 with VLB video.
               ;Much of it also optimized for a Pentium, I hope...
               ;Hard to tell since I don't have one!
               ;Works in protected mode too

;Be careful.. much of this doesn't have a lot of bounds checking.
;Give it valid numbers when at all possible!
;Clipping may come in a future version.

extrn yTab[564]:word  ;table for scan line start offsets

extrn segA000:word    ;video segment descriptor

extrn xRes:word       ;width of physical screen in pixels
extrn yRes:word       ;height of physical screen in pixels
extrn lxRes:word      ;width of virtual screen in pixels
extrn lyRes:word      ;height of virtual screen in pixels

extrn lxBytes:word    ;width of virtual screen in bytes
extrn pgBytes:word    ;size of a page in bytes
extrn pgStart:dword   ;current write page offset in bytes
extrn pgShown:dword   ;current display page offset in bytes

extrn oldMode:byte

;I/O port numbers used by the routines in this file
attrPort   =3C0h      ;VGA Attribute Controller port
miscOutPort=3C2h      ;VGA Misc Output Register
seqPort    =3C4h      ;VGA Sequencer port
gcPort     =3CEh      ;VGA Graphics Controller port
crtcPort   =3D4h      ;CRT Controller port
input1port =3DAh      ;CRT Input port #1



codeseg

;map-mask lookup tables, all indexed by (x and 3).
;they are defined in the code segment.
dotMask     db 01h,02h,04h,08h  ;the single plane that holds pixel x
lfPlaneMask db 0Fh,0Eh,0Ch,08h  ;planes of pixel x and those after it (left end of a span)
rtPlaneMask db 01h,03h,07h,0Fh  ;planes up to and including pixel x (right end of a span)


public clear          ;clear screen 0

proc clear far color:byte
;fills the current write page (pgBytes bytes at pgStart) with color,
;writing all four planes at once.
;the fill is done in dwords and then finishes any 1..3 leftover bytes,
;so a page size that isn't a multiple of 4 still gets cleared completely.
;NOTE: a pgBytes of 0 clears nothing.
  mov dx,seqPort
  mov ax,0F02h        ;map mask reg (index 2) = 0Fh: all planes enabled
  out dx,ax
  mov bl,[color]      ;gonna fill eax with color
  mov bh,bl
  shrd eax,ebx,16     ;eax high word = color:color
  mov ax,bx           ;eax = color in all four bytes
  les di,[pgStart]    ;es:di -> start of write page
  movzx ecx,[pgBytes] ;byte count, ecx high word zero
  cld
  ror ecx,2           ;cx = dword count, low 2 bits saved in ecx high
  rep stosd           ;16-bit rep only counts cx down, ecx high survives
  rol ecx,2           ;cx = leftover byte count (0..3)
  rep stosb
  ret
  endp


public textureColumn
proc textureColumn far x:word,y:word,y2:word,                 \ ;dst coords (y2>=y)
                       tx:dword,ty:dword,tx2:dword,ty2:dword, \ ;src coords
                       tp:dword,tw:word   ;tp=^src bitmap, tw=src bitmap width
;NOTE: this routine is unfinished. It selects the destination plane, computes
;the destination offset and the per-pixel source steps, and then returns
;without drawing anything.
;tx/ty/tx2/ty2 look like 16.16 fixed point (the high words are used as the
;integer part below) - TODO confirm against the caller.
;
;fixes over the original:
; - the 32-bit deltas are sign-extended into edx:eax with cdq before idiv.
;   (cwde only sign-extends ax into eax, which wrecked the dividend and left
;   edx undefined.)
; - ds is saved and restored around the lds, as the other routines here do.

  mov bx,[x]
  les di,[pgStart]
  mov si,bx
  and si,3
  mov al,2            ;map mask register index
  shr bx,2            ;bx=byte column
  mov ah,[dotMask+si] ;plane holding column x
  mov dx,seqPort
  add di,bx
  out dx,ax
  mov si,[y]
  mov cx,[y2]
  sub cx,si
  add si,si           ;word index into yTab
  inc cx
  add di,[yTab+si]    ;es:di=first dest pixel
  and ecx,0FFFFh    ;ecx=dest count
  mov eax,[tx2]
  sub eax,[tx]
  cdq               ;edx:eax=signed 64-bit dividend
  idiv ecx
  mov [tx2],eax     ;tx2=speedX
  mov eax,[ty2]
  sub eax,[ty]
  cdq
  idiv ecx
  mov [ty2],eax     ;ty2=speedY
  mov bx,[lxBytes]  ;NOTE(review): row stride for the source is taken from
                    ;lxBytes, not tw - looks unintended, confirm when finishing
  mov ax,[word high ty]
  mul bx
  add ax,[word high tx]
  mov [x],bx
  push ds
  lds si,[tp]
  add si,ax         ;ds:si=first source texel (unused so far)
  pop ds
  ret
  endp


;takes about .008 sec to move a 320x200 page on my 486DX2/66 w/VLB video
;plus you get the benefits of your drawing routines never having to
;mess with any VGA registers!
;Your memory page should be laid out in a planar format, with sequential
;planes back-to-back
;This COULD be faster than direct screen writes depending on what kind of
;drawing you're doing and what kind of video you have. But it usually isn't.

public memBlt

proc memBlt far p:dword       ;blit a memory page to video ram
;p points at four consecutive plane images. For each plane in turn the map
;mask is set to that plane and pgBytes/4 dwords are copied to pgStart.
;esi keeps running through the source; di is reset for every plane.
;fs holds the caller's ds while ds points at the source.
  mov ax,ds
  mov dx,seqPort
  mov fs,ax                ;fs=data segment (ds is about to be replaced)
  xor edi,edi              ;edi high word must be 0 for the 32-bit movs
  mov ax,0102h             ;map mask reg, plane 0
  les di,[pgStart]
  out dx,ax
  movzx ebx,[pgBytes]
  inc dx                   ;dx=sequencer data port from here on
  xor esi,esi              ;esi high word must be 0 too
  shr ebx,2                ;dwords
  lds si,[p]
  mov ecx,ebx
  rep movs [dword es:edi],[dword ds:esi]
  mov al,0010b             ;plane 1
  mov di,[word low fs:pgStart]
  out dx,al
  mov ecx,ebx
  rep movs [dword es:edi],[dword ds:esi]
  mov al,0100b             ;plane 2
  mov di,[word low fs:pgStart]
  out dx,al
  mov ecx,ebx
  rep movs [dword es:edi],[dword ds:esi]
  mov al,1000b             ;plane 3
  mov di,[word low fs:pgStart]
  out dx,al
  mov ecx,ebx
  rep movs [dword es:edi],[dword ds:esi]
  mov ax,fs
  mov ds,ax                ;restore ds
  ret
  endp

if 0

;DISABLED (if 0): alternate memBlt that reads a packed (one byte per pixel)
;page and writes it out one plane at a time, picking every 4th source byte.
;Each loop pass packs 8 source bytes into two dwords and stores them.
;NOTE(review): the "and esi,0FFFFh" below is immediately superseded by the
;"xor esi,esi" a few lines later.

proc memBlt far p:dword       ;blit a packed memory page to unchained video ram
  lgs di,[pgStart]
  mov dx,seqPort
  and esi,0FFFFh
  mov ax,ds
  mov cx,[pgBytes]
  xor esi,esi
  mov fs,ax              ;fs=data segment while ds points at the source
  lds si,[p]
  mov ax,0102h           ;map mask reg, plane 0
  shr cx,3               ;every 4th byte, loop unrolled once
  out dx,ax
  inc dx
@@L0:
  mov bh,[esi+12]
  mov bl,[esi+8]
  shl ebx,16
  mov bh,[esi+4]
  mov bl,[esi]
  mov [gs:di],ebx
  mov bh,[esi+28]
  mov bl,[esi+24]
  shl ebx,16
  mov bh,[esi+20]
  mov bl,[esi+16]
  add si,32
  mov [gs:di+4],ebx
  add di,8
  dec cx
  jnz @@L0
  mov di,[word low fs:pgStart]
  mov al,2               ;plane 1
  movzx esi,[word low p]
  mov cx,[fs:pgBytes]
  inc esi                ;start at source byte 1
  out dx,al
  shr cx,3               ;every 4th byte, loop unrolled once
@@L1:
  mov bh,[esi+12]
  mov bl,[esi+8]
  shl ebx,16
  mov bh,[esi+4]
  mov bl,[esi]
  mov [gs:di],ebx
  mov bh,[esi+28]
  mov bl,[esi+24]
  shl ebx,16
  mov bh,[esi+20]
  mov bl,[esi+16]
  add si,32
  mov [gs:di+4],ebx
  add di,8
  dec cx
  jnz @@L1
  mov di,[word low fs:pgStart]
  mov al,4               ;plane 2
  movzx esi,[word low p]
  mov cx,[fs:pgBytes]
  add esi,2              ;start at source byte 2
  out dx,al
  shr cx,3               ;every 4th byte, loop unrolled once
@@L2:
  mov bh,[esi+12]
  mov bl,[esi+8]
  shl ebx,16
  mov bh,[esi+4]
  mov bl,[esi]
  mov [gs:di],ebx
  mov bh,[esi+28]
  mov bl,[esi+24]
  shl ebx,16
  mov bh,[esi+20]
  mov bl,[esi+16]
  add si,32
  mov [gs:di+4],ebx
  add di,8
  dec cx
  jnz @@L2
  mov di,[word low fs:pgStart]
  mov al,8               ;plane 3
  movzx esi,[word low p]
  mov cx,[fs:pgBytes]
  add esi,3              ;start at source byte 3
  out dx,al
  shr cx,3               ;every 4th byte, loop unrolled once
@@L3:
  mov bh,[esi+12]
  mov bl,[esi+8]
  shl ebx,16
  mov bh,[esi+4]
  mov bl,[esi]
  mov [gs:di],ebx
  mov bh,[esi+28]
  mov bl,[esi+24]
  shl ebx,16
  mov bh,[esi+20]
  mov bl,[esi+16]
  add si,32
  mov [gs:di+4],ebx
  add di,8
  dec cx
  jnz @@L3
  mov ax,fs
  mov ds,ax              ;restore ds
  ret
  endp

endif


public plot

proc plot x:word,y:word,color:byte
;writes one pixel of the given color at (x,y) on the current write page.
;no bounds checking.
  les di,[pgStart]      ;es:di -> start of write page
  mov bx,[x]
  mov si,bx
  and si,3              ;si=plane number (x mod 4)
  shr bx,2              ;bx=byte column (x div 4)
  add di,bx
  mov bx,[y]
  add bx,bx             ;word index into yTab
  add di,[yTab+bx]      ;di=byte offset of the pixel
  mov ah,[dotMask+si]   ;mask selecting just that plane
  mov al,2              ;map mask register
  mov dx,seqPort
  out dx,ax             ;set map mask
  mov al,[color]
  mov [es:di],al
  ret
  endp


public scrn

proc scrn far x:word,y:word   ;returns color in al
;reads the pixel at (x,y) of the current write page.
;the offset comes from pgStart, the segment from segA000.
  les di,[pgStart]      ;di=page offset (es is replaced below)
  mov bx,[x]
  mov ah,bl
  and ah,3              ;ah=plane to read (x mod 4)
  shr bx,2              ;bx=byte column
  add di,bx
  mov bx,[y]
  add bx,bx             ;word index into yTab
  add di,[yTab+bx]      ;di=byte offset
  mov es,[segA000]
  mov al,4              ;read map select register in graphics controller
  mov dx,gcPort
  out dx,ax             ;set read map select
  mov al,[es:di]
  ret
  endp


public hLin

proc hLin x:word,x2:word,y:word,color:byte
;draws a horizontal line from x to x2 (both inclusive, either order) on row y.
;partial bytes at the two ends are written with a plane mask; the bytes in
;between are written with all planes enabled, using byte, word or dword
;fills depending on how many there are.
  mov si,[y]
  les di,[pgStart]
  add si,si             ;word index into yTab
  mov bx,[x]
  mov cx,[x2]
  cmp bx,cx
  jl @@NOSWAP
    xchg bx,cx          ;make bx the left end
  @@NOSWAP:
  add di,[yTab+si]      ;di=base of scan line
  mov si,bx
  shr bx,2
  and si,3
  add di,bx             ;di=offset into video buffer
  mov ah,[lfPlaneMask+si] ;ah=left edge mask
  mov si,cx
  mov al,2              ;map mask index
  and si,3
  shr cx,2
  sub cx,bx             ;width in bytes-1
  mov dx,seqPort
  mov bh,[rtPlaneMask+si] ;bh=right edge mask
  mov bl,[color]
  jcxz @@ONE            ;both ends in the same byte

  inc cx                ;cx=width in bytes

;  cmp ah,bh
;  je @@MIDDLE
    cmp bh,0Fh
    je @@LEFT           ;right byte is full: leave it to the middle fill
      xchg ah,bh        ;ah=right mask, bh=left mask
      dec cx
      add di,cx         ;di -> rightmost byte
      out dx,ax
      mov ah,bh         ;ah=left mask again
      mov [es:di],bl
      sub di,cx
    @@LEFT:
    cmp ah,0Fh
    je @@MIDDLE         ;left byte is full: leave it to the middle fill
      out dx,ax
      dec cx
      mov [es:di],bl
      mov ah,0Fh
      inc di
  @@MIDDLE:
  out dx,ax             ;all planes on for the remaining cx bytes
  cmp cx,12
  jae @@DWORDFILL
  cmp cx,4
  jae @@WORDFILL
  jcxz @@EXIT
  mov [es:di],bl        ;1..3 bytes: store them one at a time
  cmp cx,1
  je @@EXIT
  mov [es:di+1],bl
  cmp cx,2
  je @@EXIT
  mov [es:di+2],bl
@@EXIT:
  ret

@@ONE:
  and ah,bh             ;combine left & right masks
  out dx,ax
  mov [es:di],bl
  ret

@@DWORDFILL:
  cld
  mov bh,bl
  mov si,cx             ;si=bytes to fill
  shrd eax,ebx,16
  mov ax,bx             ;eax=color in all four bytes
  xor cx,cx
  sub cx,di
  and ecx,3             ;bytes needed to reach a dword boundary
  sub si,cx
  rep stosb           ;align to dword
  mov cx,si
  ror ecx,2             ;cx=dword count, low 2 bits saved in ecx high
  rep stosd
  rol ecx,2             ;cx=leftover byte count
  rep stosb           ;leftover bytes?
  ret

@@WORDFILL:
  cld
  mov ah,bl
  mov al,bl
  test di,1
  jz @@WORDALIGNED
    mov [es:di],al      ;odd start: one byte first
    dec cx
    inc di
  @@WORDALIGNED:
  shr cx,1              ;carry=odd byte left over
  rep stosw           ;fill middle words
  jnc @@EXIT
  mov [es:di],bl
  ret
  endp


public vLin

proc vLin far x:word,y:word,y2:word,color:byte
;draws a vertical line in column x between rows y and y2 (inclusive,
;either order).
;
;fixes over the original:
; - ds is now popped on every exit path. Lines shorter than 4 pixels used
;   to jump past the pop, returning with ds wrong and the stack off by a word.
; - the line is always drawn top to bottom. The old upward case negated the
;   16-bit stride and then used it zero-extended in [ebx+edx], which makes a
;   32-bit offset above 64K.
  xor ebx,ebx             ;ebx high word must be 0 for [ebx+edx] below
  mov si,[x]
  mov bx,si
  and bx,3
  mov al,02h              ;map mask register index
  mov ah,[dotMask+bx]     ;plane holding column x
  mov dx,seqPort
  out dx,ax
  mov di,[word low pgStart]
  shr si,2
  add di,si               ;di=page offset+byte column
  mov bx,[y]
  mov cx,[y2]
  cmp cx,bx
  jae @@ORDERED           ;unsigned, as the original compare was
    xchg bx,cx            ;bx=upper row, cx=lower row
  @@ORDERED:
  sub cx,bx
  inc cx                  ;cx=number of pixels
  add bx,bx               ;word index into yTab
  add di,[yTab+bx]
  mov bx,di               ;bx=offset of topmost pixel
  mov di,[lxBytes]        ;di=bytes per scan line
  mov al,[color]
  push ds
  mov ds,[word high pgStart]  ;ds=video segment from here on
  mov dx,cx
  shr cx,2                ;cx=groups of 4 pixels
  and dx,3                ;dx=0..3 odd pixels, done first
  jz @@X
  mov [bx],al
  add bx,di
  dec dx
  jz @@X
  mov [bx],al
  add bx,di
  dec dx
  jz @@X
  mov [bx],al
  add bx,di
 @@X:
  jcxz @@XIT
  mov si,di
  add si,si               ;si=2 lines
  xor edx,edx
  mov dx,si
  add dx,di               ;edx=3 lines
 @@DOLINE:               ;unrolled * 4
  mov [bx],al
  mov [bx+di],al
  mov [bx+si],al
  mov [ebx+edx],al       ;it's still worth the prefix to get rid of the AGI's
  add bx,si
  add bx,si
  dec cx
  jnz @@DOLINE
@@XIT:
  pop ds
  ret
  endp


public pane

proc pane x:word,y:word,x2:word,y2:word,color:byte
;fills the rectangle (x,y)-(x2,y2), corners inclusive, with color.
;does nothing if y2<y or if x2 lies in an earlier byte column than x.
;the rectangle is drawn as up to three vertical strips: the left byte
;column (masked), the full-plane centre, and the right byte column (masked).
;
;fix over the original: after the centre fill the right strip is skipped
;only when its mask is 0Fh (the centre already covered it). The old test
;compared against 3, which dropped the right edge whenever x2 mod 4 = 1.
local h:word,o:word
  mov ax,[y2]
  mov bx,[y]
  sub ax,bx
  jc @@EXIT
  inc ax
  mov [h],ax              ;h=height in rows
  les di,[pgStart]
  cld
  mov dx,seqPort
  mov al,2
  out dx,al               ;set up sequencer map mask
  mov ax,[lxBytes]
  mul bx
  add di,ax               ;di=base of scan line
  mov bx,[x]
  mov dx,[x2]
  mov si,bx
  and si,3
  shr bx,2                ;bx=horiz offset in bytes
  mov al,[lfPlaneMask+si] ;al=left edge mask
  mov si,dx
  and si,3
  shr dx,2
  sub dx,bx               ;dx=width in bytes-1
  jc @@EXIT
  add di,bx               ;di=offset into video buffer
  mov bh,[rtPlaneMask+si] ;bh=right edge mask
  mov [o],di
  mov cx,dx
  mov dx,seqport+1        ;dx=sequencer data port
  test cx,cx
  jnz @@LEFT
  and al,bh               ;combine left & right bitmasks
  jmp @@RIGHT
@@LEFT:
  inc cx                  ;cx=width in byte columns
  test al,al              ;(the left mask is never 0, so this never jumps)
  jz @@CENTER2
  out dx,al
  mov al,[color]
  mov si,[h]
@@LEFTL:
  mov [es:di],al
  add di,[lxBytes]
  dec si
  jnz @@LEFTL
  mov di,[o]
  inc di                  ;di=top of second column
@@CENTER2:
  dec cx                  ;left column done
  add [o],cx              ;o=top of rightmost column
@@CENTER:
  jcxz @@EXIT
  cmp bh,0Fh
  je @@C2                 ;right column is full: centre fill takes it too
    dec cx
    jz @@RIGHT2           ;nothing between left and right columns
@@C2:
  mov al,0Fh
  out dx,al
  mov al,[color]
  mov ah,al
  mov si,[h]
@@CENTERL:
  push cx
  shr cx,1
  rep stosw
  jnc @@OVER
    mov [es:di],al
    inc di
  @@OVER:
  pop cx
  add di,[lxBytes]
  sub di,cx               ;back to the start column, one row down
  dec si
  jnz @@CENTERL
  mov di,[o]
  cmp bh,0Fh              ;was the right column part of the centre fill?
  je @@EXIT
@@RIGHT2:
  mov al,bh
@@RIGHT:
  out dx,al
  mov al,[color]
  mov si,[h]
  mov bx,[lxBytes]
@@RIGHTL:
  mov [es:di],al
  add di,bx
  dec si
  jnz @@RIGHTL
@@EXIT:
  ret
  endp

if 0

;DISABLED (if 0): unfinished in-file line routine. The active branch below
;declares line as an external far procedure instead.
;NOTE(review): as written this is incomplete - the yTab lookup is commented
;out, si and bp are used in the loops without being set up, and dx is used
;both as a delta and as the port for "out dx,ax".

public line

proc line far x:word,y:word,x2:word,y2:word,color:byte
  mov ax,[x]
  mov bx,[x2]
  mov cx,[y]
  mov dx,[y2]
  cmp ax,bx      ;make sure x>x2
  jae @@NOSWAP
    xchg ax,bx
    xchg cx,dx
  @@NOSWAP:
  sub bx,ax
  sub dx,cx
  shl cx,1
  les di,[pgStart]
;  add di,[yTab+cx]  ;illegal index mode!
  mov cl,al
  and cl,3
  shr ax,2
  add di,ax
  mov ax,0102h
  shl ah,cl

  mov bl,[color]
  push ds
  push bp

  @@HLOOP:
    out dx,ax
    dec cx
    mov [di],bl
    jz @@X
    shr ah,1
    jnz @@SAMEBYTE
      dec di
      mov ah,8
    @@SAMEBYTE:
    add bp,dx
    jnc @@HLOOP
    add di,si
    jmp @@HLOOP

    out dx,ax
  @@VLOOP:
    dec cx
    mov [di],bl
    jz @@X
    add di,si
    add bp,dx
    jnc @@VLOOP
    shr ah,1
    jnz @@VSAMEBYTE
      dec di
      mov ah,8
    @@VSAMEBYTE:
    out dx,ax
    jmp @@VLOOP

  @@X:
  pop bp
  pop ds
  ret
  endp

else

;line is supplied by another module; curve calls it
extrn line:far

endif


public curve

proc curve far x1:word,y1:word,x2:word,y2:word,x3:word,y3:word,color:word,steps:word
;draws a curve from (x1,y1) to (x3,y3) as a chain of line segments.
;(x2,y2) is first pushed outward so that the curve passes through the
;point given. steps is clamped to 2..4000h.
;each point is x1 + t*(ex + fx*t) in 16.16 fixed point, for t=steps-1 down
;to 1, where ex=2*(x2-x1)/steps and fx=(x3-2*x2+x1)/steps^2 (same for y).
;x3,y3 are reused to remember the previously plotted point.
local fx:dword,ex:dword,fy:dword,ey:dword

  mov ax,[x3]     ;adjust center point out *2 so curve fits the point
  mov bx,[y3]
  add ax,[x1]
  add bx,[y1]
  shr ax,1        ;ax,bx=midpoint of the two end points
  shr bx,1
  sub ax,[x2]
  sub bx,[y2]
  sub [x2],ax     ;x2=2*x2-midpoint
  sub [y2],bx

  movzx esi,[steps]
  cmp si,2
  jae @@OK
    mov si,2
  @@OK:
  cmp si,4000h
  jbe @@OK2
    mov si,4000h
  @@OK2:
  mov edi,esi     ;edi=steps (divisor)

  mov ax,[x2]     ;pre-calc important quantities
  sub ax,[x1]
  shl eax,17      ;*2, as 16.16 fixed point
  mov edx,eax
  sar edx,31      ;sign extend into edx for idiv
  idiv edi
  mov [ex],eax

  mov ax,[y2]
  sub ax,[y1]
  shl eax,17
  mov edx,eax
  sar edx,31
  idiv edi
  mov [ey],eax

  imul edi,edi    ;edi=steps squared

  mov ax,[x3]
  sub ax,[x2]
  sub ax,[x2]
  add ax,[x1]
  shl eax,16      ;as 16.16 fixed point
  mov edx,eax
  sar edx,31
  idiv edi
  mov [fx],eax

  mov ax,[y3]
  sub ax,[y2]
  sub ax,[y2]
  add ax,[y1]
  shl eax,16
  mov edx,eax
  sar edx,31
  idiv edi
  mov [fy],eax

  dec esi
  jz @@X

  @@LOOP:
    push esi        ;esi=t, kept across the call

    push [x3]       ;line from the previous point...
    push [y3]

    mov eax,[fx]
    imul eax,esi
    add eax,[ex]
    imul eax,esi
    shr eax,16      ;integer part
    add ax,[x1]
    push ax         ;...to the new point
    mov [x3],ax

    mov eax,[fy]
    imul eax,esi
    add eax,[ey]
    imul eax,esi
    shr eax,16
    add ax,[y1]
    push ax
    mov [y3],ax

    push [word color]
    call line
    pop esi
    dec si
    jnz @@LOOP

  push [x3]         ;final segment back to (x1,y1)
  push [y3]
  push [x1]
  push [y1]
  push [word color]
  call line
@@X:
  ret
  endp


public circle      ;Keld Hansen gets credit for this cool Bresenham's circle
                   ;implementation. I've tweaked it a bit for speed.

macro inplot ;expects cx=x,dx=y,ah=color,es=vidSeg,sequencer set to map mask reg
             ;destroys al,cx,dx
             ;bx and si are saved and restored on the stack
  push bx
  push si
  mov bx,dx
  add bx,bx             ;word index into yTab
  mov si,cx
  shr cx,2              ;cx=byte column
  and si,3              ;si=plane number
  mov bx,[yTab+bx]      ;bx=offset of scan line
  mov al,[dotMask+si]
  mov dx,seqPort+1
  add bx,cx
  out dx,al             ;set map mask
  add bx,[word low pgStart]  ;this AGI will give time for sequencer to respond
  pop si
  mov [es:bx],ah
  pop bx
  endm

proc circle far xc:word,yc:word,r:word,c:byte
;draws a circle outline of radius r centred on (xc,yc) in color c.
;each pass plots the 8 symmetric points for (DeltaX,DeltaY) with inplot.
;registers in the loop: si=DeltaX, di=DeltaY, dx=D (decision value),
;bx=CenterX, bp=CenterY, ah=color.
;bp is replaced by CenterY after the last argument has been read, and
;restored before returning.
  mov dx,seqPort
  mov al,2                    ;map mask register
  mov es,[word high pgStart]
  out dx,al
  xor si,si                   ;DeltaX := 0
  mov di,[r]                  ;DeltaY := Radius
  mov dx,3                    ;D := 3-2*Radius
  sub dx,di
  sub dx,di
  push bp
  mov ah,[c]
  mov bx,[xc]                 ;bx := CenterX
  mov bp,[yc]                 ;bp := CenterY
@@L:
  push dx                     ;inplot destroys dx

  lea cx,[bx+si]              ;cx := CenterX+Deltax
  lea dx,[bp+di]              ;dx := CenterY+DeltaY
  inplot

  lea cx,[bx+di]              ;cx := CenterX+DeltaY
  lea dx,[bp+si]              ;dx := CenterY+Deltax
  inplot

  neg di                      ;di := -DeltaY

  lea cx,[bx+si]              ;cx := CenterX+Deltax
  lea dx,[bp+di]              ;dx := CenterY-DeltaY
  inplot

  lea cx,[bx+di]              ;cx := CenterX-DeltaY
  lea dx,[bp+si]              ;dx := CenterY+Deltax
  inplot

  neg si                      ;si := -Deltax

  lea cx,[bx+di]              ;cx := CenterX-DeltaY
  lea dx,[bp+si]              ;dx := CenterY-Deltax
  inplot

  lea cx,[bx+si]              ;cx := CenterX-Deltax
  lea dx,[bp+di]              ;dx := CenterY-DeltaY
  inplot

  neg di                      ;di := DeltaY

  lea cx,[bx+si]              ;cx := CenterX-Deltax
  lea dx,[bp+di]              ;dx := CenterY+DeltaY
  inplot

  lea cx,[bx+di]              ;cx := CenterX+DeltaY
  lea dx,[bp+si]              ;dx := CenterY-Deltax
  inplot

  pop dx                      ;Retrieve saved reg

  push ax                     ;preserve color
  mov ax,si                   ;note si is still -Deltax here
  mov cx,6
  test dh,dh                  ;if D < 0
  js @@DLT0
    add cx,4
    sub ax,di                   ;D+=4*(Deltax-DeltaY)+10
    dec di                      ;DeltaY--
  @@DLT0:                     ;else
  lea ax,[eax*4+ecx]            ;D+=4*Deltax+6
  add dx,ax
  inc si                      ;Deltax++
  pop ax
  cmp si,di
  jle @@L
  pop bp
  ret
  endp


public calcEdge

proc calcEdge near x:word,y:word,x2:word,y2:word,tbl:dword
;walks the edge (x,y)-(x2,y2) one scan line at a time and stores the x
;position for each row into tbl. tbl has 4 bytes per row; edges that run
;downward (y<y2) write the first word of a row, edges that run upward write
;the second word. Horizontal edges (y=y2) store nothing.
;x is stepped in fixed point with 8 fraction bits.
;rows above the screen (y<0) are stepped over; the walk stops at lyRes.
;NOTE(review): the @@LEFT/@@RIGHT clamps set dx, but the value stored at
;@@OK is cx, so the clamp has no effect on the table. Also dx as loaded by
;the shld is the top 16 bits of the fixed point value rather than the
;integer x. rowList clips against the sides itself - confirm the intent
;before changing either.
  les di,[tbl]
  mov ax,[y]
  mov bx,[y2]
  xor ecx,ecx
  mov cx,[x]
  mov dx,[x2]
  cmp ax,bx
  je @@X
  jl @@NOSWAP
    add di,2          ;upward edge: use the second word of each row
    xchg ax,bx
    xchg cx,dx
  @@NOSWAP:
  lea di,[eax*4+edi]  ;es:di now points to table[minY,dir]
  sub bx,ax       ;bx=row count-1
  mov si,ax       ;si=y
  sub dx,cx       ;dx=x distance
  xor al,al
  mov ah,dl
  sar dx,8        ;dx:ax=x distance*256
  idiv bx
  inc bx          ;bx=ySteps
  cwde            ;eax=xStep
  shl ecx,8       ;ecx=x
  mov cl,80h      ;pre-round
  @@L:
    test si,si
    js @@OFFSCREEN
    cmp si,[lyRes]
    jge @@X
     ror ecx,8        ;cx=integer x, fraction parked in ecx top byte
    shld edx,ecx,24
    test dx,dx
    js @@LEFT
    cmp dx,[lxRes]
    jge @@RIGHT
    @@OK:
    mov [es:di],cx
    rol ecx,8         ;back to fixed point form
    @@OFFSCREEN:
    add ecx,eax       ;x+=xStep
    add di,4          ;next row of the table
    inc si
    dec bx
    jnz @@L
  @@X:
  ret

    @@RIGHT:
      mov dx,[lxRes]
      dec dx
      jmp @@OK
    @@LEFT:
      xor dx,dx
      jmp @@OK

  endp

public rowList

proc rowList far startY:word,count:word,tbl:dword,color:byte
;fills count scan lines starting at row startY from an edge table.
;tbl has 4 bytes per row: two x positions (words), both inclusive,
;in either order. Entry startY of the table is used for the first row.
;each span is clipped to 0..lxRes-1; a row is skipped when its first word
;is >=lxRes or its second word is negative.
;
;fix over the original: the right-hand clamp is now lxRes-1. It used to be
;lxRes, which put one pixel past the end of the row on every clipped span.
  cmp [count],0
  jz @@X
  mov ax,[startY]
  shl ax,2
  add [word low tbl],ax   ;tbl -> entry for startY
  @@L:
    les di,[tbl]
    mov bx,[es:di]
    mov cx,[es:di+2]
    mov dx,[lxRes]       ;clip against sides
    cmp bx,dx
    jge @@SKIP
    test cx,cx
    js @@SKIP
    test bx,bx
    jns @@LOK
      xor bx,bx
    @@LOK:
    cmp cx,dx
    jl @@ROK
      mov cx,dx
      dec cx              ;last visible column is lxRes-1
    @@ROK:

    mov si,[startY]       ;this is pretty much just the hlin code
    les di,[pgStart]      ;but replicated inline to avoid function
    add si,si             ;call overhead
    cmp bx,cx
    jl @@NOSWAP
      xchg bx,cx
    @@NOSWAP:
    add di,[yTab+si]      ;di=base of scan line
    mov si,bx
    and si,3
    shr bx,2
    add di,bx             ;di=offset into video buffer
    mov ah,[lfPlaneMask+si] ;ah=left edge mask
    mov si,cx
    mov al,2              ;map mask index
    and si,3
    shr cx,2
    sub cx,bx             ;width in bytes-1
    mov dx,seqPort
    mov bh,[rtPlaneMask+si] ;bh=right edge mask
    mov bl,[color]
    jcxz @@ONE
    inc cx                ;cx=width in bytes
    cmp bh,0Fh
    je @@LEFT             ;right byte is full: leave it to the middle fill
      xchg ah,bh
      dec cx
      out dx,ax
      add di,cx           ;di -> rightmost byte
      mov ah,bh
      mov [es:di],bl
      sub di,cx
    @@LEFT:
    cmp ah,0Fh
    je @@MIDDLE           ;left byte is full: leave it to the middle fill
      out dx,ax
      dec cx
      mov [es:di],bl
      mov ah,0Fh
      inc di
    @@MIDDLE:
    out dx,ax             ;all planes on for the remaining cx bytes
;    cmp cx,12
;    jae @@DWORDFILL  ;dwords end up slowing it down in most cases
    cmp cx,4
    jae @@WORDFILL
    jcxz @@SKIP
    mov [es:di],bl
    cmp cx,1
    je @@SKIP
    mov [es:di+1],bl
    cmp cx,2
    je @@SKIP
    mov [es:di+2],bl
    jmp @@SKIP

  @@ONE:
    and ah,bh             ;combine left & right masks
    out dx,ax
    mov [es:di],bl
    jmp @@SKIP

;  @@DWORDFILL:
;    cld
;    mov bh,bl
;    shrd eax,ebx,16
;    mov ax,bx
;    mov si,cx
;    xor cx,cx
;    sub cx,di
;    and ecx,3
;    sub si,cx
;    rep stosb           ;align to dword
;    mov cx,si
;    ror ecx,2
;    rep stosd
;    rol ecx,2
;    rep stosb           ;leftover bytes?
;    jmp @@SKIP

  @@WORDFILL:
    cld
    mov ah,bl
    mov al,bl
    test di,1
    jz @@WORDALIGNED
      mov [es:di],al      ;odd start: one byte first
      dec cx
      inc di
    @@WORDALIGNED:
    shr cx,1              ;carry=odd byte left over
    rep stosw           ;fill middle words
    jnc @@SKIP
    mov [es:di],bl

  @@SKIP:
    add [word low tbl],4  ;next table entry
    inc [startY]
    dec [count]
    jnz @@L
  @@X:
  ret
  endp

;------------------------------------------------------------------------

;drawSprite takes a pointer to a custom sprite data structure
;which must be produced in this format:
;
; word    width in bytes
; word    height of sprite data (must match # of 0 commands in each plane)
; word    horz offset of sprite center from upper left
; word    vert " "
;
;this is followed by 4 sets of these commands, one for each plane:
;
;         0  move to start of next row
;   1...127  draw this many bytes that follow in sprite data
;  -1..-128  skip over this many screen bytes
;
;All rows must be accounted for in each plane, but rows don't have
;to have data for the entire width of the sprite.
;This allows for oddly-shaped/semi-transparent objects like text
;as well as solid rectangular objects. As written, it's fast
;enough that a separate routine to draw non-transparent images
;is unnecessary (virtually no speed gain)

;This format is peculiar because each set of commands describes how
;to draw one plane of the sprite. Planes are left to right, and are
;interlaced. (yes unchained modes are peculiar.)  This is the fastest
;known way to get sprite data onto the screen so if you want speed,
;figure this format out and live with it. Or, be slow--see if I care.

;for a 12x2 object, here is the order it is stored and gets drawn:
;(letters are planes, numbers are byte index into the plane)
; a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3
; a4 b4 c4 d4 a5 b5 c5 d5 a6 b6 c6 d6

;let's say you're trying to draw this simple shape:
;       ##@
;      #@#@#
;       @# #

;plane 12341

;it's 5x3. The first plane has two bytes, the rest one.
;so the resulting file needed to draw this shape would consist of
;the following hex bytes (# is color 33 hex, @ is color 44 hex)

; width                  05 00
; height                 03 00
; h ofs                  02 00
; v ofs                  01 00
; plane one   row one    00            first and fifth column data
;             row two    02 33 33 00
;             row three  FF 01 33 00   (ff=-1, or 1 transparent byte)
; plane two   row one    01 33 00      second column data
;             row two    01 44 00
;             row three  01 44 00
; plane three row one    01 33 00      third column data
;             row two    01 33 00
;             row three  01 33 00
; plane four  row one    01 44 00      fourth column data
;             row two    01 44 00
;             row three  00

;This example is a contrived (and small) one. Large ones turn out
;much better in actual practice.

;the sprite, when drawn, is not clipped. If drawing partially offscreen
;it is expected that you will have extra virtual display space surrounding
;the display window.

public drawSprite

proc drawSprite far sprite:dword,x:word,y:word
;draws the sprite with its top left corner at (x,y) on the current write
;page, one plane at a time, starting with the plane that holds column x.
;the high halves of the 32-bit registers are used as spare storage:
;  esi high=bytes per scan line     ebx high=dest offset for this plane
;  eax high=sprite height           edx high=planes left to do-1
;bp is used as the row counter, so no argument is read after "push ebp".
  cld

  mov ax,[x]
  mov cx,ax
  shr ax,2           ;bytes
  les bx,[pgStart]   ;calc offset using active page
  mov di,[y]
  add di,di
  add bx,[yTab+di]
  add bx,ax          ;bx=offset into video ram of top left of sprite

  mov si,[lxBytes]
  shl esi,16         ;esihi=bytes per scan line

  push ds
  lds si,[sprite]
  push ebp

  shl ebx,16         ;ebxhi=dst reloader
  mov ax,[si+2]      ;ax=sprite height
  shl eax,16         ;eaxhi=sprite height reloader
  and cl,3
  mov ax,0102h
  shl ah,cl          ;ah=current plane
  add si,8           ;skip size/origin data
  mov edx,30000h+seqPort   ;edxhi=planes of sprite data to go-1
  @@PLANE:
    out dx,ax        ;set sequencer map mask
    shld edi,ebx,16  ;di=dst
    shld ebp,eax,16  ;bp=sprite height
    mov bx,di        ;save dst
    @@ROW:
      shld ecx,esi,16 ;cx=bytes per scan line
      add bx,cx      ;bx=start of the next row
      db 0A9h        ;test ax,imm16 instr to skip next 2 byte instr
      @@SKIP:
        sub di,cx    ;bump di by -count (2 byte instr)
      @@DATA:
        movsx cx,[si] ;next command byte
        inc si
        test cx,cx
        js @@SKIP    ;negative: skip that many screen bytes
        jz @@NEXTROW ;zero: end of row
        movzx ecx,cx ;zero ecxhi
        ror ecx,2    ;save lo 2 bits in ecxhi
        rep movsd    ;fast dword move..doesn't affect ecxhi
        rol ecx,2    ;retrieve saved bits
        jz @@DATA    ;anything to do?
        rep movsb    ;finish off remaining bytes
        jmp @@DATA
      @@NEXTROW:
      mov di,bx
      dec bp
      jnz @@ROW
    shl ah,1
    test ah,10h        ;did plane wrap to next byte?
    jz @@NOPLANEWRAP
      add ebx,10000h   ;dest moves one byte right
      mov ah,1
    @@NOPLANEWRAP:
    sub edx,10000h     ;any more planes?
    jnc @@PLANE
  pop ebp
  pop ds
  ret
  endp

;drawTile takes a pointer to a special record format too. It's:
;  word  width in bytes
;  word  height
;followed by 4 planes of packed bytes (first is leftmost)
;this is a little faster than drawSprite for non-transparent objects
;this isn't clipped either. See drawSprite for details.

public drawTile

proc drawTile far tile:dword,x:word,y:word
;draws the tile with its top left corner at (x,y) on the current write
;page, one plane at a time, starting with the plane that holds column x.
;the high halves of the 32-bit registers are used as spare storage:
;  esi high=dest offset for this plane   ebx high=tile width in bytes
;  eax high=tile height                  edx high=planes left to do-1
;bx holds the distance from the end of one row to the start of the next.
;bp is used as the row counter, so no argument is read after "push ebp".
;
;fix over the original: the record pointer is loaded from [tile]. The
;original named [sprite], which is not a parameter of this procedure.
  cld
  mov ax,[x]
  mov cx,ax
  shr ax,2           ;bytes
  les si,[pgStart]   ;calc offset using active page
  mov bx,[y]
  shl bx,1
  add si,[yTab+bx]
  add si,ax          ;si=offset into video ram of top left of tile
  shl esi,16         ;esihi=dest reloader

  mov bx,[lxBytes]

  push ds
  lds si,[tile]
  push ebp

  mov ax,[si]        ;width in bytes
  sub bx,ax
  rol ebx,16
  mov bx,ax
  rol ebx,16         ;ebxhi=width reloader, bx=offset to next scan line

  mov ax,[si+2]      ;height
  add si,4           ;skip size data
  shl eax,16         ;eaxhi=tile height reloader
  and cl,3
  mov ax,0102h
  mov edx,30000h+seqPort   ;edxhi=planes of tile data to go-1
  shl ah,cl          ;ah=current plane
  @@PLANE:
    out dx,ax        ;set sequencer map mask
    shld edi,esi,16  ;di=dest
    shld ebp,eax,16  ;bp=height
    xor cx,cx
    @@ROW:
      shld ecx,ebx,16 ;reload count
      ror ecx,2       ;cx=dword count, low 2 bits saved in ecx high
      rep movsd
      rol ecx,2       ;cx=leftover byte count
      rep movsb
      add di,bx       ;on to the next scan line
      dec bp
      jnz @@ROW
    shl ah,1
    test ah,10h        ;did plane wrap to next byte?
    jz @@NOPLANEWRAP
      add esi,10000h   ;dest moves one byte right
      mov ah,1
    @@NOPLANEWRAP:
    sub edx,10000h     ;any more planes?
    jnc @@PLANE
  pop ebp
  pop ds
  ret
  endp

if 0


public CopyRect

;DISABLED (inside "if 0"): copies a rectangle from one place in display
;memory to another through the VGA latches (bit mask set to 0 so every byte
;written comes from the latches). Source end coordinates are exclusive:
;nothing is copied when srcEndX<=srcStartX or srcEndY<=srcStartY.
;page bases are offsets into the segA000 segment; bitmap widths are in pixels.
proc CopyRect far srcStartX:word,srcStartY:word,   \
                  srcEndX:word,srcEndY:word,       \
                  dstStartX:word,dstStartY:word,       \
                  srcPageBase:word,dstPageBase:word, \
                  srcBmpWidth:word,dstBmpWidth:word
local srcNext:word,dstNext:word, \
      height:word,rectAddrWidth:word
  push ds

  cld
  mov dx,gcPort       ;set the bit mask to select all bits
  mov ax,8            ; from the latches and none from
  out dx,ax           ; the CPU, so that we can write the
                      ; latch contents directly to memory
  mov es,[segA000]
  mov ax,[dstBmpWidth]
  shr ax,2             ;convert to width in addresses
  mul [dstStartY]     ;top dst rect scan line
  mov di,[dstStartX]
  shr di,2             ;X/4 = offset of first dst rect pixel in scan line
  add di,ax            ;offset of first dst rect pixel in page
  add di,[dstPageBase] ;offset of first dst rect pixel in display memory
  mov ax,[srcBmpWidth]
  shr ax,2             ;convert to width in addresses
  mul [srcStartY]   ;top src rect scan line
  mov si,[srcStartX]
  mov bx,si
  shr si,2             ;X/4 = offset of first src rect pixel in scan line
  add si,ax            ;offset of first src rect pixel in page
  add si,[srcPageBase]  ;offset of first src rect pixel in display memory
  and bx,0003h                   ;look up left edge plane mask to clip
  mov ah,[lfPlaneMask+bx]
  mov bx,[srcEndX]
  and bx,0003h                   ;look up right edge plane mask to clip
  mov al,[rtPlaneMask+bx]
  mov bx,ax                      ;put the masks in BX

  mov cx,[srcEndX]   ;calculate # of addresses across
  mov ax,[srcStartX] ; rect
  cmp cx,ax
  jle @@CopyDone        ;skip if 0 or negative width
  dec cx
  and ax,not 011b
  sub cx,ax
  shr cx,2       ;# of addresses across rectangle to copy - 1
  jnz @@MasksSet ;there's more than one address to draw
  and bh,bl      ;there's only one address, so combine the left
                 ; and right edge clip masks
@@MasksSet:
  mov ax,[srcEndY]
  sub ax,[srcStartY]  ;AX = height of rectangle
  jle @@CopyDone         ;skip if 0 or negative height
  mov [Height],ax
  mov ax,[dstBmpWidth]
  shr ax,2    ;convert to width in addresses
  sub ax,cx   ;distance from end of one dst scan line to
  dec ax      ;start of next
  mov [dstNext],ax
  mov ax,[srcBmpWidth]
  shr ax,2    ;convert to width in addresses
  sub ax,cx   ;distance from end of one src scan line to
  dec ax      ; start of next
  mov [srcNext],ax
  mov [RectAddrWidth],cx ;remember width in addresses - 1
  mov dx,seqPort ;point to Sequence Controller Data reg
  mov al,2       ;map mask reg
  out dx,al
  inc dx      ;primed and ready
  mov ax,es   ;DS=ES=screen segment for MOVS
  mov ds,ax
@@CopyRowsLoop:
  mov cx,[RectAddrWidth] ;width across - 1
  mov al,bh   ;put left-edge clip mask in AL
  out dx,al   ;set the left-edge plane (clip) mask
  movsb       ;copy the left edge (pixels go through latches)
  dec cx      ;count off left edge address
  js @@CopyLoopBottom ;that's the only address
  jz @@DoRightEdge ;there are only two addresses
  mov al,00fh ;middle addresses are drawn 4 pixels at a pop
  out dx,al   ;set the middle pixel mask to no clip
  rep movsb   ;draw the middle addresses four pixels apiece
              ; (pixels copied through latches)
@@DoRightEdge:
  mov al,bl   ;put right-edge clip mask in AL
  out dx,al   ;set the right-edge plane (clip) mask
  movsb       ;draw the right edge (pixels copied through latches)
@@CopyLoopBottom:
  add si,[srcNext] ;point to the start of next src & dst lines
  add di,[dstNext]
  dec [Height]     ;count down scan lines
  jnz @@CopyRowsLoop
@@CopyDone:
  mov dx,gcPort+1     ;restore the bit mask to its default,
  mov al,0ffh         ; which selects all bits from the CPU
  out dx,al           ; and none from the latches (the GC
                      ; Index still points to Bit Mask)
  pop ds
  ret
  endp
endif

;--------------------------------------------------------------------

;I/O ports used by the palette routines below
dacReadIndex   =3C7h;   ;written with the first color index before reading
dacWriteIndex  =3C8h;   ;written with the first color index before writing
dacDataRegister=3C9h;   ;color component bytes are read/written here


public setColor

proc setColor far color:byte,r:byte,g:byte,b:byte
;programs one palette entry: selects index color, then sends r, g and b
;to the data register in that order.
 mov al,[color]
 mov dx,dacWriteIndex
 out dx,al               ;pick the entry to change
 mov dx,dacDataRegister
 mov al,[r]
 out dx,al
 mov al,[g]
 out dx,al
 mov al,[b]
 out dx,al
 ret
 endp


public getColor

proc getColor far color:byte  ;returns longint color in ax,dx
;reads the three component bytes of palette entry color.
;result: dx=first byte read, ah=second byte, al=third byte
;(the same order setColor writes r,g,b in).
 mov al,[color]
 mov dx,dacReadIndex
 out dx,al               ;pick the entry to read
 mov dx,dacDataRegister
 in al,dx
 mov bl,al               ;hold the first byte; dx is still needed as the port
 in al,dx
 mov ah,al
 in al,dx
 movzx dx,bl
 ret
 endp


public setPalette

proc setPalette far color:byte,num:word,rgb:dword
;loads num consecutive palette entries, starting at index color, from the
;buffer at rgb (3 bytes per entry). Does nothing when num is 0.
 mov cx,[num]
 test cx,cx
 jz @@DONE
 imul cx,cx,3          ;cx=number of bytes to send
 mov al,[color]
 mov dx,dacWriteIndex
 out dx,al             ;first entry to change
 mov dx,dacDataRegister
 push ds
 lds si,[rgb]
 cld
 rep outsb
;@@SLOW: outsb   ;use this loop in place of "rep outsb" if that turns out
; dec cx         ;to be too fast for your vga card
; jnz @@SLOW
 pop ds
@@DONE:
 ret
 endp


public getPalette

proc getPalette far color:byte,num:word,rgb:dword
;reads num consecutive palette entries, starting at index color, into the
;buffer at rgb (3 bytes per entry). Does nothing when num is 0.
 mov cx,[num]
 test cx,cx
 jz @@DONE
 imul cx,cx,3          ;cx=number of bytes to fetch
 mov al,[color]
 mov dx,dacReadIndex
 out dx,al             ;first entry to read
 mov dx,dacDataRegister
 les di,[rgb]
 cld
 rep insb
;@@SLOW: insb    ;use this loop in place of "rep insb" if that turns out
;  dec cx        ;to be too fast for your vga card
;  jnz @@SLOW
@@DONE:
 ret
 endp

;--------------------------------------------------------------------


public setWritePage

proc setWritePage far adr:word
;makes adr the offset of the page that the drawing routines write to.
;only the offset half of pgStart changes.
  mov bx,[adr]
  mov [word low pgStart],bx
  ret
  endp


public setDisplayPage

proc setDisplayPage far adr:word
;makes adr the display start address, records it in pgShown, and then
;waits until the vertical retrace bit is seen set.
  mov bx,[adr]
  mov dx,CRTCPort
  mov [word low pgShown],bx    ;save display page start address
  mov ah,bh
  mov al,0Ch                   ;CRTC reg 0Ch takes the hi byte
  cli                          ;keep the two halves together
  out dx,ax
  mov ah,bl
  mov al,0Dh                   ;CRTC reg 0Dh takes the lo byte
  out dx,ax
  sti
  mov dx,input1Port
@@WAIT:
  in al,dx
  test al,8
  jz @@WAIT                    ;wait for vertical retrace
  ret
  endp

;scrolls the window so that x,y is the upper-left corner
;x must be <=lxRes-xRes, y must be <=lyRes-yRes

public setWindow

proc setWindow far x:word,y:word
;scrolls the display so that (x,y) of the current write page is the upper
;left corner: sets the CRTC start address to the byte holding (x,y), waits
;for vertical retrace, then sets the pixel pan register for x mod 4.
;the new start address is also stored in pgShown.
;
;fix over the original: the pixel pan value is sent with a byte write.
;The word write also pushed ah out to port 3C1h.
  mov bx,[y]
  shl bx,1
  mov ax,[yTab+bx]
  mov bx,[x]
  mov cl,bl         ;keep x for the pixel pan below
  shr bx,2
  add bx,[word low pgStart]
  add bx,ax         ;bx=ofs of top left corner
  mov [word low pgShown],bx  ;save display page start address
;  shr bx,1          ;set display start address
  mov dx,CRTCPort
  mov al,0Ch        ;hi byte
  mov ah,bh
  cli
  out dx,ax
  inc ax            ;lo byte
  mov ah,bl
  out dx,ax         ;now we're ready for a retrace to happen.
  sti
  mov dx,input1Port
@@L:
  in al,dx          ;this also resets attribute controller flip/flop
  test al,8
  jz @@L            ;wait for vertical retrace
  mov dx,attrPort
  mov al,33h
  cli
  out dx,al         ;select Pixel Pan reg
  and cl,3
  mov al,cl
  shl al,1          ;for 256 color mode
  out dx,al         ;data byte only
  sti
  ret
  endp


public waitRetrace

proc waitRetrace far
;returns at the start of the next vertical retrace: first lets any retrace
;already in progress finish, then waits for the retrace bit to come on.
  mov dx,input1Port
@@INRETRACE:
  in al,dx
  test al,8
  jnz @@INRETRACE   ;still in a retrace, wait for it to end
@@NOTYET:
  in al,dx
  test al,8
  jz @@NOTYET       ;wait for the retrace to begin
  ret
  endp


;--------------------------------------------------------------------

;sets up the specified version of Mode X.  Allows for a virtual screen which
;can be larger than the displayed screen (which can then be scrolled)
;max scroll-to coord is (lxRes-xRes, lyRes-yRes)

public setModeX

yTabEntries=564       ;number of words in yTab (see its extrn declaration)

proc setModeX far tblX:dword,tblY:dword,logX:word,logY:word ;returns boolean in al
;sets an unchained 256-color mode described by two tables, with a virtual
;screen of logX by logY pixels.
;each table, as read here: word 0=resolution, word 1=bits for the Misc
;Output reg (the two tables' words are ORed together), then CRTC
;index/value words ending with a zero word.
;logX is rounded down to a multiple of 8; logX and logY are raised to the
;physical resolution if smaller.
;on success returns al=1 with xRes,yRes,lxRes,lyRes,lxBytes,pgBytes,pgStart,
;pgShown and yTab filled in and the screen cleared.
;returns ax=0 if no VGA was found or the virtual screen doesn't fit.
;the previous video mode is saved in oldMode either way.
;
;fixes over the original:
; - a virtual height of more than yTabEntries rows is rejected; building
;   yTab for it would write past the end of the table.
; - the direction flag is cleared before the lods loops.
  mov ah,0Fh
  int 10h
  mov [oldMode],al   ;save old Gr mode
  mov ax,1A00h
  int 10h            ;check for VGA
  cmp al,1Ah
  jne @@BADMODE      ;no VGA Bios
  cmp bl,7
  jb @@BADMODE       ;is VGA or better?
  cmp bl,0FFh
  je @@BADMODE
  les si,[tblX]      ;mode horiz info table ptr
  mov ax,[es:si]
  mov [xRes],ax
  mov cx,[logX]
  and cx,0FFF8h
  cmp cx,ax         ;lxRes must be >=xRes
  jae @@XOK
    mov cx,ax
  @@XOK:
  mov [lXRes],cx
  mov di,cx
  shr di,2
  mov [lxBytes],di
  les si,[tblY]      ;mode vert info table ptr
  mov dx,[es:si]
  mov [yRes],dx
  mov bx,[logY]
  cmp bx,dx         ;lyRes must be >=yRes
  jae @@YOK
    mov bx,dx
  @@YOK:
  cmp bx,yTabEntries ;every row needs a slot in yTab
  ja @@BADMODE
  mov [lYRes],bx
  mov ax,di
  mul bx            ;calculate needed bytes (lxBytes*lyRes). Must be <=$10000
  mov [pgBytes],ax
  sub ax,1          ;subtract 1 for easy compare
  sbb dx,0
  jnz @@BADMODE

  mov ax,13h
  int 10h                 ;start with mode 13h
  mov dx,seqPort
  mov ax,0604h
  out dx,ax               ;disable Chain 4 mode
  mov ax,0100h
  out dx,ax               ;asynchronous reset
  mov dx,miscOutPort
  mov ax,[es:si+2]        ;put Dot Clock and Lines/Polarity bytes together
  les si,[tblX]
  or ax,[es:si+2]
  out dx,al               ;send them to Misc Output Reg to set Timing/Size
  mov dx,seqPort
  mov ax,0300h
  out dx,ax               ;restart Sequencer

  mov dx,CRTCPort         ;un-write protect regs 0~7
  mov al,11h
  out dx,al
  inc dx
  in al,dx
  and al,7Fh
  out dx,al
  dec dx

  mov ax,0014h            ;turn off dword mode
  out dx,ax
  jmp $+2
  mov ax,0E317h           ;turn on byte mode
  out dx,ax

  ;Now we're in unchained 256-color mode. All that's left is to set the screen size!

  cld                     ;the lods loops below must run forward
  add si,4                ;now send table of horiz CRTC values/regs to the CRTC port
  db 0A8h                 ;test al,imm8 instr to skip next 1-byte instr
  @@L:
    out dx,ax
    lods [word es:si]
    test ax,ax
    jnz @@L
  les si,[tblY]
  add si,4                ;now send table of vert CRTC values/regs to the CRTC port
  db 0A8h                 ;test al,imm8 instr to skip next 1-byte instr
  @@L2:
    out dx,ax
    lods [word es:si]
    test ax,ax
    jnz @@L2

  mov ax,[lXBytes]
  shr ax,1             ;Offset Value = lXBytes / 2
  mov ah,al            ;Switch format for out
  mov al,13h           ;CRTC Offset Register Index
  out dx,ax            ;Set VGA CRTC Offset Reg

  mov [word low pgStart],0
  mov [word low pgShown],0
  mov ax,[segA000]
  mov [word high pgStart],ax
  mov [word high pgShown],ax

  push 0
  call clear           ;clear the screen

  mov di,offset yTab   ;now make a fast lookup table for scan lines
  push ds
  pop es
  mov ax,0
  mov bx,[lxBytes]
  mov cx,[lyRes]
  @@MAKETAB:
    stosw              ;yTab[row]=row*lxBytes
    add ax,bx
    dec cx
    jnz @@MAKETAB

  mov al,1            ;mode set successfully!
  db 0A9h             ;test ax,imm16 instr to skip next 2-byte instr
@@BADMODE:            ;this is obviously either not a color vga
  xor ax,ax           ;or the logical dimensions were too big.
  ret
  endp


public setText

proc setText far
;puts the display back into the video mode that setModeX saved in oldMode.
  movzx ax,[oldMode]   ;al=saved mode, ah=0 (BIOS set video mode)
  int 10h
  ret
  endp


ends
end
