Accessing intel graphics card registers through I/O space and MMIO in UEFI - io

I am trying to write a code sequence, that will switch my intel graphics card into the legacy VGA mode after a call to Exitbootservices() in my uefi nasm bootloader. In order to do that I am to change some values in my graphics card's registers. My graphics card supports two ways of accessing its registers - through MMIO and through I/O space via a pair of I/O registers called MMIO_ADDRESS and MMIO_DATA. Base address for the MMIO and base port for the I/O access are obtained through the PCI configuration space registers. I successfully got both of the values from the PCI configuration space. Weirdly, trying to read any register value through the MMIO(MMIO_BAR + reg_offset) or through the pair of I/O registers always returns zero. Trying to write to any of the device's registers with bot ways, listed above, also results in nothing. I would like to know, what I miss, because clearly I am missing something. My guess is that, the graphics card register access may be locked by some graphics card feature. By the way, my intel graphics card is operated by the linux i915 driver. I validated MMIO BAR and I/O port base multiple times by: using lspci -vvv in Linux, by calling the pci util in the UEFI shell and by simply reading the values from the pci configuration space with my bootloader.
This is the code I have so far. I call get_mmap(), then I call exitbootservices(). After that I change the caching policy for the VGA framebuffer by changing values in the fixed-range MTRRs. After that I put some pixels on the screen(I chechked the buffer address via GOP in another bootloader) to see that PC is not stalled after a call to exitbootservices(). After that I call a function, that should disable all current display modes of my intel graphics card. Then I put more pixels on the screen to see that computer is not stalled, though, pixels should not be visible and the first sequence of pixels, I printed earlier, is expected to disappear after I alter the graphics card's register values. After that I call a sequence of functions, that put a VGA compatible graphics card into the VGA 13h mode(256 colors, 320x200)(confirmed to work on real hardware in 16-bit real-mode bootloader).
My code(jd9999_hdr_macro is an implementation of the UEFI PE header):
[BITS 64]
[DEFAULT ABS]
[ORG 0x00100000]
%include "jd9999_hdr_macro.inc"
jd9999_hdr_macro textsize, datasize, 0x00100000, textsize+datasize+1024
section .text follows=.header
start:
sub rsp, 6*8+8 ; Copied from Charles AP's implementation, fix stack alignment issue (Thanks Charles AP!)
mov qword [EFI_HANDLE], rcx
mov qword [EFI_SYSTEM_TABLE], rdx
mov rax, qword [EFI_SYSTEM_TABLE]
mov rax, qword [rax+96]
mov rax, qword [rax+56]
mov qword [get_mmap_boot_srvc], rax
mov rax, qword [EFI_SYSTEM_TABLE]
mov rax, qword [rax+64]
mov rax, qword [rax+8]
mov qword [efi_print], rax
mov rax, qword [EFI_SYSTEM_TABLE]
mov rax, qword [rax+96]
mov rax, qword [rax+232]
mov qword [exit_boot_services], rax
mov rcx, mmap_sz
mov rdx, MMap
mov r8, mmkey
mov r9, mmdsz
mov r10, mmdsv ; get_mmap
sub rsp, 32
call qword [get_mmap_boot_srvc]
add rsp, 32
mov rcx, qword [EFI_HANDLE] ; EBS
mov rdx, qword [mmkey]
xor r8, r8
sub rsp, 32
call qword [exit_boot_services]
add rsp, 32
mov rdi, 0x00000000a0000000
mov eax, 0x22822837 ; First sequence of pixels
mov rcx, 0x2223
cld
rep stosd
mov eax, 00000001000000010000000100000001b
mov edx, 00000001000000010000000100000001b ; Reprogram MTRRs
mov ecx, 0x00000259
wrmsr
mov rdi, 0x00000000c2000000;
call disable_all_display_modes ; Here I pass the MMIO BAR in rdi and put some stuff into registers through it. After this call the screen should become blank and any further attempts to print any pixels to it should not display anything.
mov rdi, 0x00000000a0000000 + 0x2223*4 ; Second sequence of pixels(should not be displayed)
mov eax, 0x33722818
mov rcx, 0x2223
cld
rep stosd
mov rsi, VGA13h
call set_regs
; VGA 13h sequence
mov rsi, palette256
call set_palette256
cld
mov rcx, 64000
mov rdi, 0xA0000
mov al, 60 ; Fill the screen in VGA 13h mode
rep stosb
cli
hlt
; rdi - MMIO BAR
align 8
disable_all_display_modes: ; A function that disables all current video modes through MMIO
mov eax, dword [rdi+0x61140]
and eax, 11011111111111111111111111111111b
mov dword [rdi+0x61140], eax ; Disable sDVO ports stall
mov eax, dword [rdi+0x61160]
and eax, 11011111111111111111111111111111b
mov dword [rdi+0x61160], eax
mov eax, dword [rdi+0x70080]
and eax, 11111111111111111111111111011000b ; CURACNTR - cursor A disable
mov dword [rdi+0x70080], eax
mov eax, dword [rdi+0x700c0]
and eax, 11111111111111111111111111011000b ; CURBCNTR - cursor B disable
mov dword [rdi+0x700c0], eax
mov eax, dword [rdi+0x70180]
and eax, 01111111111111111111111111111111b ; DSPACNTR - Plane A disable
mov dword [rdi+0x70180], eax
mov eax, dword [rdi+0x71180]
and eax, 01111111111111111111111111111111b ; DSPBCNTR - Plane B disable
mov dword [rdi+0x71180], eax
mov eax, dword [rdi+0x70008]
or eax, 00000000000011000000000000000000b ; PIPEACONF - Disable all planes and cursors
mov dword [rdi+0x70008], eax
mov eax, dword [rdi+0x70008]
;test eax, 10000000000000000000000000000000b
;jz ._PIPEA_skip_wait
and eax, 01111111111111111111111111111111b ; PIPEACONF - Disable PIPE
mov dword [rdi+0x70008], eax
;lea eax, [edi+0x70008]
;monitor ; this is probably the proper way of waiting for the pipes to disable
;mwait
;._PIPEA_skip_wait:
mov eax, dword [rdi+0x71008]
or eax, 00000000000011000000000000000000b ; PIPEBCONF - Disable all planes and cursors
mov dword [rdi+0x71008], eax
mov eax, dword [rdi+0x71008]
;test eax, 10000000000000000000000000000000b
;jz ._PIPEB_skip_wait
and eax, 01111111111111111111111111111111b ; PIPEBCONF - Disable PIPE
mov dword [rdi+0x71008], eax
;lea eax, [edi+0x71008]
;monitor ; this is probably the proper way of waiting for the pipes to disable
;mwait
;._PIPEB_skip_wait:
mov eax, dword [rdi+0x71400]
and eax, 01111111111111111111111111111111b ; Disable VGA display
mov dword [rdi+0x71400], eax
mov eax, dword [rdi+0x68000]
and eax, 11111111111111111111011111111111b ; Disable panel fitter
mov dword [rdi+0x68000], eax
mov eax, dword [rdi+0x6014]
and eax, 01101111111111111111111111111111b ; DPLLA_CTRL - DPLL VCO = 0(disabled), VGA mode = 0 (enabled)
mov dword [rdi+0x6014], eax
mov eax, dword [rdi+0x6018]
and eax, 01101111111111111111111111111111b ; DPLLB_CTRL - DPLL VCO = 0(disabled), VGA mode = 0 (enabled)
mov dword [rdi+0x6018], eax
ret
align 8
; dx - IOBAR
disable_all_display_modes_IO: ; A function that disables all current video modes through I/O
mov eax, 0x61140
out dx, eax
add dx, 4
in eax, dx
and eax, 11011111111111111111111111111111b
out dx, eax
sub dx, 4
mov eax, 0x61160
out dx, eax ; Disable sDVO ports stall
add dx, 4
in eax, dx
and eax, 11011111111111111111111111111111b
out dx, eax
sub dx, 4
mov eax, 0x70080
out dx, eax
add dx, 4 ; CURACNTR - cursor A disable
in eax, dx
and eax, 11111111111111111111111111011000b
out dx, eax
sub dx, 4
mov eax, 0x700c0
out dx, eax
add dx, 4 ; CURBCNTR - cursor B disable
in eax, dx
and eax, 11111111111111111111111111011000b
out dx, eax
sub dx, 4
mov eax, 0x70180
out dx, eax
add dx, 4 ; DSPACNTR - Plane A disable
in eax, dx
and eax, 01111111111111111111111111111111b
out dx, eax
sub dx, 4
mov eax, 0x71180
out dx, eax
add dx, 4 ; DSPBCNTR - Plane B disable
in eax, dx
and eax, 01111111111111111111111111111111b
out dx, eax
sub dx, 4
mov eax, 0x70008
out dx, eax
add dx, 4
in eax, dx
or eax, 00000000000011000000000000000000b ; PIPEACONF - Disable all planes and cursors
out dx, eax
sub dx, 4
mov eax, 0x71008
out dx, eax
add dx, 4
in eax, dx
or eax, 00000000000011000000000000000000b ; PIPEBCONF - Disable all planes and cursors
out dx, eax
sub dx, 4
mov eax, 0x71400
out dx, eax
add dx, 4
in eax, dx
and eax, 01111111111111111111111111111111b ; Disable VGA display
out dx, eax
sub dx, 4
mov eax, 0x68000
out dx, eax
add dx, 4
in eax, dx
and eax, 11111111111111111111011111111111b ; Disable panel fitter
out dx, eax
sub dx, 4
mov eax, 0x6014
out dx, eax
add dx, 4
in eax, dx
and eax, 01101111111111111111111111111111b ; DPLLA_CTRL - DPLL VCO = 0(disabled), VGA mode = 0 (enabled)
out dx, eax
sub dx, 4
mov eax, 0x6018
out dx, eax
add dx, 4
in eax, dx
and eax, 01101111111111111111111111111111b ; DPLLB_CTRL - DPLL VCO = 0(disabled), VGA mode = 0 (enabled)
out dx, eax
sub dx, 4
ret
align 8
set_regs: ; Set VGA registers for the 13h mode
xor rdx, rdx
xor rax, rax
xor rcx, rcx
cli
cld
mov dx, 0x3C2
outsb
mov dx, 0x3DA
outsb
xor cx, cx
mov dx, 0x3C4
.loop_CRTC_:
lodsb
xchg al, ah
mov al, cl
out dx, ax
inc cx
cmp cl, 4
jbe .loop_CRTC_
mov dx, 0x3D4
mov ax, 0x0E11
out dx, ax
xor cx, cx
mov dx, 0x3D4
.loop_CRTC_2:
lodsb
xchg al, ah
mov al, cl
out dx, ax
inc cx
cmp cl, 0x18
jbe .loop_CRTC_2
xor cx, cx
mov dx, 0x3CE
.loop_GC_:
lodsb
xchg al, ah
mov al, cl
out dx, ax
inc cx
cmp cl, 8
jbe .loop_GC_
mov dx, 0x3DA
in al, dx
xor cx, cx
mov dx, 0x3C0
.l4:
in ax, dx
mov al, cl
out dx, al
outsb
inc cx
cmp cl, 0x14
jbe .l4
mov al, 0x20
out dx, al
sti
ret
align 8
set_palette256: ; Set Vga palette
xor rdx, rdx
xor rax, rax
xor rcx, rcx
cld
.loop_:
mov dx, 0x03C8
out dx, al ; output index
inc dx ; port 0x3C9
mov cx, 3
;rep outsb
outsb ; red
outsb ; blue
outsb ; green
inc ax
cmp ax, 256
jl .loop_
ret
times 2048 - ($-$$) db 0 ;alignment
textsize equ $-$$
section .data follows=.text
dataStart:
tststr dw __utf16__(`test_\0`)
numretstr dw __utf16__(`0x0000000000000000\n\0`)
;Handover variables
EFI_HANDLE dq 0
EFI_SYSTEM_TABLE dq 0
get_mmap_boot_srvc dq 0
efi_print dq 0
exit_boot_services dq 0
memmap_UEFI:
type dd 0
phys_addr dq 0
virt_addr dq 0
num_pafes dq 0
attribute dq 0
mmap_sz dq 4096
mmdsz dq 48
mmkey dq 0
mmdsv dq 0
VGA13h db 0x63, 0x00, 0x03, 0x01, 0x0F, 0x00, 0x0E, 0x5F, 0x4F
db 0x50, 0x82, 0x54, 0x80, 0xBF, 0x1F, 0x00, 0x41, 0x00
db 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x0E, 0x8F, 0x28
db 0x40, 0x96, 0xB9, 0xA3, 0xFF, 0x00, 0x00, 0x00, 0x00
db 0x00, 0x40, 0x05, 0x0F, 0xFF, 0x00, 0x01, 0x02, 0x03
db 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C
db 0x0D, 0x0E, 0x0F, 0x41, 0x00, 0x0F, 0x00, 0x00
palette256 db 00, 00, 00, 00, 10, 41, 12, 28, 18, 02, 43, 22, 35
db 19, 09, 58, 00, 00, 57, 35, 12, 43, 43, 47, 24, 24
db 28, 20, 24, 60, 10, 60, 15, 31, 47, 63, 62, 56, 20
db 60, 56, 22, 63, 61, 36, 63, 63, 63, 00, 00, 00, 05
db 05, 05, 08, 08, 08, 11, 11, 11, 14, 14, 14, 17, 17
db 17, 20, 20, 20, 24, 24, 24, 28, 28, 28, 32, 32, 32
db 36, 36, 36, 40, 40, 40, 45, 45, 45, 50, 50, 50, 56
db 56, 56, 63, 63, 63, 13, 12, 15, 15, 16, 22, 17, 20
db 29, 19, 24, 36, 21, 28, 43, 23, 31, 50, 25, 34, 57
db 26, 42, 63, 00, 15, 02, 01, 22, 04, 02, 29, 06, 03
db 36, 08, 04, 43, 10, 05, 50, 12, 06, 57, 14, 20, 63
db 40, 18, 06, 07, 25, 12, 11, 33, 17, 14, 40, 23, 18
db 48, 28, 21, 55, 34, 25, 62, 39, 27, 63, 48, 36, 15
db 03, 02, 22, 06, 04, 29, 09, 06, 36, 12, 08, 43, 15
db 10, 50, 18, 12, 57, 21, 14, 63, 28, 20, 15, 00, 00
db 22, 07, 00, 29, 15, 00, 36, 23, 00, 43, 31, 00, 50
db 39, 00, 57, 47, 00, 63, 55, 00, 15, 05, 03, 22, 11
db 07, 29, 17, 11, 36, 23, 15, 43, 29, 19, 50, 35, 23
db 57, 41, 27, 63, 53, 34, 28, 14, 12, 33, 20, 14, 38
db 26, 16, 43, 32, 18, 48, 38, 20, 53, 44, 22, 58, 50
db 24, 63, 56, 30, 05, 05, 06, 10, 10, 13, 15, 15, 20
db 20, 20, 27, 25, 25, 34, 30, 30, 41, 35, 35, 48, 44
db 44, 63, 03, 06, 05, 05, 11, 09, 07, 16, 13, 09, 21
db 17, 11, 26, 21, 13, 31, 25, 15, 36, 29, 20, 48, 38
db 06, 06, 07, 13, 13, 15, 20, 20, 23, 27, 27, 31, 34
db 34, 39, 41, 41, 47, 48, 48, 55, 57, 57, 63, 06, 15
db 04, 12, 22, 08, 18, 29, 12, 24, 36, 16, 30, 43, 20
db 36, 50, 24, 42, 57, 28, 54, 63, 35, 15, 10, 10, 22
db 16, 16, 29, 21, 21, 36, 27, 27, 43, 32, 32, 50, 38
db 38, 57, 43, 43, 63, 54, 54, 15, 15, 06, 22, 22, 12
db 29, 29, 18, 36, 36, 24, 43, 43, 30, 50, 50, 36, 57
db 57, 42, 63, 63, 54, 02, 04, 14, 06, 12, 21, 10, 20
db 28, 14, 28, 35, 18, 36, 42, 22, 44, 49, 26, 52, 56
db 36, 63, 63, 18, 04, 14, 24, 08, 21, 31, 12, 28, 37
db 16, 35, 44, 20, 42, 50, 24, 49, 57, 28, 56, 63, 38
db 63, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 53, 44, 22, 09
db 08, 12, 16, 14, 16, 22, 21, 20, 29, 27, 24, 35, 34
db 28, 42, 40, 32, 48, 47, 36, 57, 56, 43, 08, 12, 16
db 14, 16, 22, 21, 20, 29, 27, 24, 35, 34, 28, 42, 40
db 32, 48, 47, 36, 57, 56, 43, 63, 13, 09, 11, 21, 16
db 15, 27, 22, 18, 36, 29, 22, 42, 35, 25, 51, 42, 29
db 57, 48, 32, 63, 56, 39, 06, 14, 09, 12, 21, 14, 18
db 27, 22, 24, 33, 28, 30, 39, 36, 36, 46, 42, 42, 52
db 47, 50, 59, 53, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00
times 1024 - ($-$$) db 0 ;alignment
MMap:
times 4096 db 0
datasize equ $-$$
Photos of the computer screen after the program finished(from afar and a closeup with explanation):
As you can see, VGA 13h draws some weird stuff on the screen because previous graphics mode was not disabled. So, why does my code does not disable the current video mode of my intel graphics card(considering that the Base address of MMIO is alright)?.

Related

Assembly minimum, middle, max, sum, and integer average of a list of numbers

I need to create a simple x96-64 assembly language program to compute the min, middle value, max, sum and integer average of a list of numbers. When I try to assemble it gives me errors. My code so far:
; -----
; Define constants.
NULL equ 0 ; end of string
TRUE equ 1
FALSE equ 0
EXIT_SUCCESS equ 0 ; successful operation
SYS_exit equ 60 ; call code for terminate
; -----
lst dd 4220, -1116, 1542, 1240, 1677
dd -1635, 2426, 1820, 1246, -333
dd 2315, -215, 2726, 1140, 2565
dd 2871, 1614, 2418, 2513, 1422
dd -119, 1215, -1525, -712, 1441
dd -3622, -731, -1729, 1615, 2724
dd 1217, -224, 1580, 1147, 2324
dd 1425, 1816, 1262, -2718, 1192
dd -1435, 235, 2764, -1615, 1310
dd 1765, 1954, -967, 1515, 1556
dd 1342, 7321, 1556, 2727, 1227
dd -1927, 1382, 1465, 3955, 1435
dd -225, -2419, -2534, -1345, 2467
dd 1615, 1961, 1335, 2856, 2553
dd -1035, 1835, 1464, 1915, -1810
dd 1465, 1554, -267, 1615, 1656
dd 2192, -825, 1925, 2312, 1725
dd -2517, 1498, -670, 1475, 2030
dd 1223, 1883, -1173, 1350, 2415
dd -335, 1125, 1118, 1713, 3020
length dd 100
lstMin dd 0
lstMid dd 0
lstMax dd 0
lstSum dd 0
lstAve dd 0
evenCnt dd 0
evenSum dd 0
evenAve dd 0
tenCnt dd 0
tenSum dd 0
tenAve dd 0
; *****************************************************************
section .text
global _start
_start:
; ----------------------------------------------
mov rcx, 0
mov ecx, dword [length]
mov eax, dword[lst]
mov dword, [lst +lstMin] eax
mov dword, [lst +lstMax] eax
mov rsi, 0
mov dword [lst +lstSum],0
sumLp:
mov eax, dword [lst+rsi]
add dword [lst + lstSum],eax
cmp eax, dword [lst+lstMin]
jge minDone
mov dword[lstMin],eax
minDone:
cmp eax,dword [lstMax]
jle maxDone
mov dword[lstMax],eax
maxDone:
add rsi, 4
dec rcx
cmp rcx, 0
jne sumLp
mov eax, dword [lstSum]
cdq
idiv dword [length]
mov dword[lstAve],eax
; *****************************************************************
; Done, terminate program.
last:
mov eax, SYS_exit ; call call for exit (SYS_exit)
mov ebx, EXIT_SUCCESS ; return code of 0 (no error)
syscall
The errors I get when assembling with YASM are similar to:
myprog.asm:60: error: unexpected `,' after instruction
myprog.asm:61: error: unexpected `,' after instruction
Why am I getting these errors and how can I fix them?

all strings are printing on the console at the same time in assembly language 8086

I am having a problem with my assembly code I was making a linear search program and in that program i have initializes two strings; the string when user is asked to input a number and other when there is a positive result but the problem is that when i run my program, all strings are printing at the same time on the console.
note: there are some other errors also but please ignore them at this time.
include irvine32.inc
.data
searchArr WORD 1, 4, 7, 14, 299, 156, 3, 63, 29, 300, 20
user_input word ?
search_str byte "enter the number you want to search",0ah,0dh
yes_str byte "the number is in given array",0ah,0dh
.code
main proc
mov edx,offset search_str
call writestring
mov edx,0
; asking the number from user
call readint
mov user_input , ax
mov eax,0
;seacrhing the array
mov ecx, (lengthof searchArr)
mov esi,0
search:
mov bx,searchArr[esi * type searchArr]
cmp bx,user_input
je yes
inc esi
loop search
yes:
mov edx,0
mov edx,offset yes_str
call writestring
exit
main endp
end main

Assembly: how to convert number into ascii and write to display buffer

I am new to assembly and am programming in linux 64 bit in AT&T syntax. If I store the number 1 in a register, how can I translate that to the ascii character "A"? For example:
movl $1, %ebx
addl $64, %ebx
Can I add 64 to 1 to make 65 (the decimal value of A), then somehow convert it to "A" and send this to the buffer using write system call?
EDIT 1: Posting my program code here.
.section .data
message:
.long 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
length:
.long 10
.section .text
.globl _start
_start:
xorq %rdi, %rdi
xorq %rax, %rax
xorq %rbx, %rbx
xorq %rcx, %rcx
xorq %rdx, %rdx
movl length, %edx
loop:
cmpl %ecx, %edx
je loop_end
movl message(,%rdi,4), %eax
addl $64, %eax
pushq %rax
incq %rdi
incq %rcx
jmp loop
loop_end:
cmpq $0, %rcx
je exit
popq %rbx
pushq %rcx
movq $1, %rax
movq $1, %rdi
movq %rbx, %rsi
movl length, %edx
syscall
popq %rcx
decq %rcx
jmp loop_end
exit:
movq $60, %rax
movq $0, %rdi
syscall
I'm not entirely familiar with AT&T syntax, but the disassembly of NASM in what you're accustomed to should suffice.
You should try to avoid what is called hard coding constants as it makes your program harder to maintain, especially when it's hundreds if not thousands of lines in length. Therefore;
section .data
Values: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 18, 12, 20, 19, 11
V_Size equ $ - Values
is preferable to this
message:
.long 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
length:
.long 10
What you did is not wrong, but the method is predicated upon you counting, not the assembler. As it has already been pointed out, use the smallest data size required to get the job done. In this case char is better than long
This code in NASM
section .text
global _start
_start: xor ecx, ecx
push rcx ; Applications default return value
mov cl, V_Size
push rcx
mov ebx, Values
push rbx
Next:
or byte [ebx], 64
inc ebx
loop Next
pop rsi
pop rdx
pop rax
inc al
mov edi, eax
syscall
mov edi, eax
dec edi
mov eax, edi
mov al, 60
syscall
section .data
Values: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 18, 12, 20, 19, 11
V_Size equ $ - Values
will yield
ABCDEFGHIJZRLTSK
with command prompt immediatly after "K".
section .data:
6000d8 01020304 05060708 090a1a12 0c14130b
section .text:
<_start>: These two instructions are idiosyncratic to my style of programming and not
essential to functionality of program.
4000b0: 31 c9 xor %ecx,%ecx
4000b2: 51 push %rcx
Setup RCX & RBX for LOOP instruction
4000b3: b1 10 mov $0x10,%cl
4000b5: 51 push %rcx ARG2 to syscall
4000b6: bb d8 00 60 00 mov $0x6000d8,%ebx
4000bb: 53 push %rbx ARG1 to syscall
<Next>: This conforms to the scope of your objective.
4000bc: 67 80 0b 40 orb $0x40,(%ebx) [ebx] += 'A'
4000c0: ff c3 inc %ebx
4000c2: e2 f8 loop 4000bc <Next>
ssize_t write (int fd, const void *buf, size_t count);
4000c4: 5e pop %rsi ARG1 = ASCII Pntr
4000c5: 5a pop %rdx ARG2 = # of chars
4000c6: 58 pop %rax
4000c7: fe c0 inc %al SYS_WRITE
4000c9: 89 c7 mov %eax,%edi ARG0 = STD_OUT
4000cb: 0f 05 syscall
Epilogue: Again, just a method I use.
4000cd: 89 c7 mov %eax,%edi
4000cf: ff cf dec %edi
4000d1: 89 f8 mov %edi,%eax
4000d3: b0 3c mov $0x3c,%al
4000d5: 0f 05 syscall

"The value of ESP was not properly saved across a function call." even with LEAVE

When attempting to call a method which is defined in assembly, I'm receiving the error "The value of ESP was not properly saved across a function call.", using Visual Studio 2012. Looking at other questions, a common factor was the mention that the assembly may not have the LEAVE instruction at the end of each label.
I am receiving this error, but with the following code, which does include the LEAVE instruction.
section .bss
vs: resb 13 ; 12-byte vendor string + NULL char
ns: resb 49 ; 48-byte proc. name + NULL char
section .text
global _meta_vendor
global _meta_procname
_meta_vendor:
push ebp
mov ebp, esp
mov eax, 0h
cpuid
mov [vs], ebx
mov [vs + 4], edx
mov [vs + 8], ecx
mov byte [vs + 12], 0h
mov eax, vs
leave
ret
_meta_procname:
push ebp
mov ebp, esp
mov eax, 80000002h
cpuid
mov [ns], eax
mov [ns + 4], ebx
mov [ns + 8], ecx
mov [ns + 12], edx
mov eax, 80000003h
cpuid
mov [ns + 16], eax
mov [ns + 20], ebx
mov [ns + 24], ecx
mov [ns + 28], edx
mov eax, 80000004h
cpuid
mov [ns + 32], eax
mov [ns + 36], ebx
mov [ns + 40], ecx
mov [ns + 44], edx
mov byte [ns + 48], 0h
mov eax, ns
leave
ret
Function prototypes for those labels are then in a header file which contains only this:
#include <cstdint>
extern "C" {
char* meta_vendor();
char* meta_procname();
}
Any insight as to why I am receiving this error?
Note that, if I click "Continue" on the popup which appears, the expected values do appear.
The problem was solved by adding push ebx and pop ebx as is shown below:
_meta_vendor:
push ebp
mov ebp, esp
push ebx
; code...
pop ebx
leave
ret

Faster assembly optimized way to convert between 8-bit grayscale and RGB32 image with SSE

I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion.
Source is an 8 bits grey image, Destination should be an 32 bits grey image (BGRA) with 4th channel (alpha) to be ignored. Source address is not guaranteed to be 16 byte aligned, Count is a multiple of 16, Destination address is 16 byte aligned.
INPUT: 8 bits single channel grey image
OUTPUT: 32 bits BGRA (alpha channel ignored)
COUNT: Image size is a multiple of 16
CPU: x86-32 (SSE2/SSE3 allowed)
Here is my optimized assembly code. Is there an even faster way of conversion?
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
static unsigned int __declspec(align(64)) Masks[] = {
0x80000000, 0x80010101, 0x80020202, 0x80030303,
0x80040404, 0x80050505, 0x80060606, 0x80070707,
0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
};
__asm {
mov esi, Source
mov edi, Destination
mov edx, Count
xor ecx, ecx
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
movdqu xmm0, xmmword ptr [esi + ecx]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
pshufb xmm1, xmm5
pshufb xmm2, xmm6
pshufb xmm3, xmm7
movntdq [edi + 0], xmm0
movntdq [edi + 16], xmm1
movntdq [edi + 32], xmm2
movntdq [edi + 48], xmm3
add edi, 64
add ecx, 16
cmp ecx, edx
jb l1
}
}
There is another approach using several PUNPCKLBW and PUNPCKHBW but that seems to be slightly slower.
Update: This is the basic non optimized algorithm:
BGRA* Destination = ...
unsigned char* Source ...
for (unsigned int i = 0; i < Size; i++) {
Destination[i].Blue = Source[i];
Destination[i].Green = Source[i];
Destination[i].Red = Source[i];
}
PS: I also tried using C code with MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves which causes the code to be 10-20% slower than pure assembly.
Update 2: This is the same code by using intrinsics only.
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
static const unsigned int __declspec(align(64)) Masks[] = {
0x80000000, 0x80010101, 0x80020202, 0x80030303,
0x80040404, 0x80050505, 0x80060606, 0x80070707,
0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
};
register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));
for (unsigned int i = 0; i < Count / 16; i++) {
__m128i r0 = _mm_load_si128(Source + i);
_mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
_mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
_mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
_mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
}
}
Update 3: This is the compiler generated code (beautified) (Visual Studio 2012, all optimization on):
push ebp
mov ebp, esp
mov edx, dword ptr [ebp+8]
movdqa xmm1, xmmword ptr ds:[Masks + 0]
movdqa xmm2, xmmword ptr ds:[Masks + 16]
movdqa xmm3, xmmword ptr ds:[Masks + 32]
movdqa xmm4, xmmword ptr ds:[Masks + 48]
push esi
test ecx, ecx
je l2
lea esi, [ecx-1]
shr esi, 4
inc esi
l1:
mov ecx, edx
movdqu xmm0, xmmword ptr [ecx]
mov ecx, eax
movdqa xmm5, xmm0
pshufb xmm5, xmm1
movdqa xmmword ptr [ecx], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm2
movdqa xmmword ptr [eax+10h], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm3
movdqa xmmword ptr [eax+20h], xmm5
lea ecx, [eax+30h]
add edx, 10h
add eax, 40h
dec esi
pshufb xmm0, xmm4
movdqa xmmword ptr [ecx], xmm0
jne l1
l2:
pop esi
pop ebp
ret
It seems that interleaving movdqa with pshufb is some what faster.
Update 4: This seems to be the optimal hand optimized code:
__asm {
mov esi, Source
mov edi, Destination
mov ecx, Count
movdqu xmm0, xmmword ptr [esi]
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
dec ecx
lea edi, [ edi + 64 ]
lea esi, [ esi + 16 ]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
movdqa [edi - 64], xmm0
pshufb xmm1, xmm5
movdqa [edi - 48], xmm1
pshufb xmm2, xmm6
movdqa [edi - 32], xmm2
pshufb xmm3, xmm7
movdqa [edi - 16], xmm3
movdqu xmm0, xmmword ptr [esi]
ja l1
}
Update 5: This conversion algorithm uses punpck instruction. However this conversion routine is a bit slower than using masks and pushfb.
for (unsigned int i = 0; i < Count; i += 16) {
register __m128i r0 = _mm_load_si128(Source++);
register __m128i r1 = _mm_unpackhi_epi8(r0, r0);
register __m128i r2 = _mm_unpacklo_epi8(r0, r0);
register __m128i r3 = _mm_unpackhi_epi8(r1, r1);
register __m128i r4 = _mm_unpacklo_epi8(r1, r1);
register __m128i r5 = _mm_unpackhi_epi8(r2, r2);
register __m128i r6 = _mm_unpacklo_epi8(r2, r2);
_mm_store_si128(Destination++, r6);
_mm_store_si128(Destination++, r5);
_mm_store_si128(Destination++, r4);
_mm_store_si128(Destination++, r3);
}
Update 6: For the sake of completeness this is the inverse method to convert from 32 bits back to 8 bits grey image.
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
static const unsigned char __declspec(align(64)) Masks[] = {
0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
};
register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));
for (unsigned int i = 0; i < Count / 64; i++) {
__m128i a = _mm_load_si128(Source + (i * 4) + 0);
__m128i b = _mm_load_si128(Source + (i * 4) + 1);
__m128i c = _mm_load_si128(Source + (i * 4) + 2);
__m128i d = _mm_load_si128(Source + (i * 4) + 3);
a = _mm_shuffle_epi8(a, m0);
b = _mm_shuffle_epi8(b, m1);
c = _mm_shuffle_epi8(c, m2);
d = _mm_shuffle_epi8(d, m3);
__m128i e = _mm_or_si128(a, b);
__m128i f = _mm_or_si128(c, d);
__m128i g = _mm_or_si128(e, f);
_mm_stream_si128(Destination + i, g);
}
}
Would try:
__asm {
mov esi, Source
mov edi, Destination
mov ecx, Count
movdqu xmm0, xmmword ptr [esi]
movdqa xmm4, xmmword ptr [Masks + 0]
movdqa xmm5, xmmword ptr [Masks + 16]
movdqa xmm6, xmmword ptr [Masks + 32]
movdqa xmm7, xmmword ptr [Masks + 48]
l1:
dec ecx // modern Intel can macro-fuse this with jnz if adjacent
lea edi, [ edi + 64 ]
lea esi, [ esi + 16 ]
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pshufb xmm0, xmm4
pshufb xmm1, xmm5
pshufb xmm2, xmm6
pshufb xmm3, xmm7
movntdq [edi - 64], xmm0
movntdq [edi - 48], xmm1
movntdq [edi - 32], xmm2
movntdq [edi - 16], xmm3
movdqu xmm0, xmmword ptr [esi]
jnz l1
}
Haven't benchmarked it though; assumptions behind these changes:
the movdqu xmm0,... latency can be a little more hidden within the loop (your code has the load of xmm0 followed directly by an instruction using the value in that register)
the add ops on two regs as well as the cmp aren't really all necessary; address generation (lea) and the implicit zero test by dec/jnz can be used. That way, there'll be no EFLAGS dependencies caused by operations on ecx/esi/edi as the only ALU op in the loop is decrementing the loop counter.
In the end, this is likely load/store bound in any case so the arithmetics are "free game"; I therefore expect little difference, even with the arguments as given.
If the input is large, then it'd make sense to strip the "unaligned head/tail" off, i.e. to do a duff's device for the first/last [0..15] bytes, and the main loop using movdqa.
Edit:
Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:
Disassembly of section .text:
0000000000000000 <ConvertGreyToRgb32Assembler>:
0: 85 d2 test edx,edx
2: 74 76 je 7a <ConvertGreyToRgb32Assembler+0x7a>
4: 66 0f 6f 2d 00 00 00 00 movdqa xmm5,XMMWORD PTR [rip+0x0]
# c <ConvertGreyToRgb32Assembler+0xc>
c: 48 89 f8 mov rax,rdi
f: 66 0f 6f 25 00 00 00 00 movdqa xmm4,XMMWORD PTR [rip+0x0]
# 17 <ConvertGreyToRgb32Assembler+0x17>
17: 66 0f 6f 1d 00 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0]
# 1f <ConvertGreyToRgb32Assembler+0x1f>
1f: 66 0f 6f 15 00 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0]
# 27 <ConvertGreyToRgb32Assembler+0x27>
27: 66 0f 1f 84 00 00 00 00 00 nop WORD PTR [rax+rax*1+0x0]
30: f3 0f 6f 00 movdqu xmm0,XMMWORD PTR [rax]
34: 48 89 f1 mov rcx,rsi
37: 48 83 c0 10 add rax,0x10
3b: 66 0f 6f c8 movdqa xmm1,xmm0
3f: 66 0f 38 00 cd pshufb xmm1,xmm5
44: 66 0f e7 0e movntdq XMMWORD PTR [rsi],xmm1
48: 66 0f 6f c8 movdqa xmm1,xmm0
4c: 66 0f 38 00 cc pshufb xmm1,xmm4
51: 66 0f e7 4e 10 movntdq XMMWORD PTR [rsi+0x10],xmm1
56: 66 0f 6f c8 movdqa xmm1,xmm0
5a: 66 0f 38 00 c2 pshufb xmm0,xmm2
5f: 66 0f 38 00 cb pshufb xmm1,xmm3
64: 66 0f e7 4e 20 movntdq XMMWORD PTR [rsi+0x20],xmm1
69: 66 0f e7 41 30 movntdq XMMWORD PTR [rcx+0x30],xmm0
6e: 89 c1 mov ecx,eax
70: 29 f9 sub ecx,edi
72: 48 83 c6 40 add rsi,0x40
76: 39 ca cmp edx,ecx
78: 77 b6 ja 30 <ConvertGreyToRgb32Assembler+0x30>
7a: f3 c3 repz ret
This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.

Resources