Accessing intel graphics card registers through I/O space and MMIO in UEFI - io
I am trying to write a code sequence that will switch my Intel graphics card into legacy VGA mode after a call to ExitBootServices() in my UEFI NASM bootloader. In order to do that I need to change some values in my graphics card's registers. My graphics card supports two ways of accessing its registers - through MMIO and through I/O space via a pair of I/O registers called MMIO_ADDRESS and MMIO_DATA. The base address for the MMIO and the base port for the I/O access are obtained through the PCI configuration space registers. I successfully got both of the values from the PCI configuration space. Weirdly, trying to read any register value through the MMIO (MMIO_BAR + reg_offset) or through the pair of I/O registers always returns zero. Trying to write to any of the device's registers with either of the methods listed above also has no effect. I would like to know what I am missing, because clearly I am missing something. My guess is that the graphics card register access may be locked by some graphics card feature. By the way, under Linux my Intel graphics card is driven by the i915 driver. I validated the MMIO BAR and the I/O port base multiple times: by using lspci -vvv in Linux, by calling the pci utility in the UEFI shell, and by simply reading the values from the PCI configuration space with my bootloader.
This is the code I have so far. I call get_mmap(), then I call exitbootservices(). After that I change the caching policy for the VGA framebuffer by changing values in the fixed-range MTRRs. After that I put some pixels on the screen (I checked the buffer address via GOP in another bootloader) to see that the PC has not stalled after the call to exitbootservices(). After that I call a function that should disable all current display modes of my Intel graphics card. Then I put more pixels on the screen to see that the computer has not stalled; these pixels should not be visible, and the first sequence of pixels that I printed earlier is expected to disappear after I alter the graphics card's register values. After that I call a sequence of functions that put a VGA-compatible graphics card into VGA mode 13h (256 colors, 320x200) (confirmed to work on real hardware in a 16-bit real-mode bootloader).
My code(jd9999_hdr_macro is an implementation of the UEFI PE header):
[BITS 64]
[DEFAULT ABS]
[ORG 0x00100000]
%include "jd9999_hdr_macro.inc"
jd9999_hdr_macro textsize, datasize, 0x00100000, textsize+datasize+1024
section .text follows=.header
;-----------------------------------------------------------------------
; start - UEFI image entry point.
; ABI:   UEFI x64 calling convention (Microsoft x64):
;        rcx = ImageHandle, rdx = pointer to the EFI system table.
; Flow:  cache the service pointers -> GetMemoryMap -> ExitBootServices
;        -> draw test pixels -> write MSR 0x259 -> disable the display
;        through MMIO -> program VGA mode 13h -> fill 0xA0000 -> halt.
; Out:   does not return on success. If GetMemoryMap fails, or
;        ExitBootServices fails twice, returns to the firmware with the
;        EFI status in rax.
; Stack: 56-byte frame: [rsp+0..31] shadow space for callees,
;        [rsp+32] 5th-argument slot, [rsp+40] retry counter.
;-----------------------------------------------------------------------
start:
        sub     rsp, 6*8+8                      ; entry rsp is 8 mod 16 -> now 16-byte aligned
        mov     qword [EFI_HANDLE], rcx
        mov     qword [EFI_SYSTEM_TABLE], rdx

        mov     rax, qword [rdx+96]             ; SystemTable->BootServices
        mov     rcx, qword [rax+56]             ; BootServices->GetMemoryMap
        mov     qword [get_mmap_boot_srvc], rcx
        mov     rcx, qword [rax+232]            ; BootServices->ExitBootServices
        mov     qword [exit_boot_services], rcx
        mov     rax, qword [rdx+64]             ; SystemTable->ConOut
        mov     rax, qword [rax+8]              ; ConOut->OutputString
        mov     qword [efi_print], rax

        ; ExitBootServices fails when the map key is stale; the required
        ; recovery is to fetch the memory map again and retry.
        mov     dword [rsp+40], 2               ; attempts remaining
.get_map:
        mov     qword [mmap_sz], 4096           ; in/out parameter: reset to the capacity of MMap
        mov     rcx, mmap_sz                    ; arg1: &MemoryMapSize
        mov     rdx, MMap                       ; arg2: MemoryMap buffer
        mov     r8, mmkey                       ; arg3: &MapKey
        mov     r9, mmdsz                       ; arg4: &DescriptorSize
        mov     rax, mmdsv
        mov     qword [rsp+32], rax             ; arg5: &DescriptorVersion goes on the stack, not in r10
        call    qword [get_mmap_boot_srvc]
        test    rax, rax
        jnz     .fail                           ; e.g. buffer too small: report to the firmware

        mov     rcx, qword [EFI_HANDLE]         ; arg1: ImageHandle
        mov     rdx, qword [mmkey]              ; arg2: MapKey
        call    qword [exit_boot_services]
        test    rax, rax
        jz      .boot_services_gone
        dec     dword [rsp+40]
        jnz     .get_map
.fail:
        add     rsp, 6*8+8
        ret                                     ; rax = EFI status of the failing call

.boot_services_gone:
        ; First sequence of pixels
        mov     rdi, 0x00000000a0000000         ; framebuffer address (author checked it via GOP)
        mov     eax, 0x22822837
        mov     rcx, 0x2223
        cld
        rep stosd

        ; Reprogram MTRRs: write 0x01 into every byte of MSR 0x259
        mov     eax, 0x01010101
        mov     edx, 0x01010101
        mov     ecx, 0x00000259
        wrmsr

        ; Pass the MMIO BAR in rdi. After this call the screen should go
        ; blank and further pixel writes should not be displayed.
        mov     rdi, 0x00000000c2000000
        call    disable_all_display_modes

        ; Second sequence of pixels (should not be displayed)
        mov     rdi, 0x00000000a0000000 + 0x2223*4
        mov     eax, 0x33722818
        mov     rcx, 0x2223
        cld
        rep stosd

        ; VGA 13h sequence
        mov     rsi, VGA13h
        call    set_regs
        mov     rsi, palette256
        call    set_palette256

        ; Fill the screen in VGA 13h mode
        cld
        mov     rcx, 64000                      ; 320 * 200 bytes
        mov     rdi, 0xA0000
        mov     al, 60
        rep stosb

        cli
.halt:
        hlt
        jmp     .halt                           ; NMI/SMI can wake hlt: never fall through into the next routine
; rdi - MMIO BAR
;-----------------------------------------------------------------------
; disable_all_display_modes - turn off the active display pipeline by
; read-modify-writing graphics registers through MMIO.
; In:    rdi = MMIO base address; register offsets are added to it.
; Out:   nothing.
; Clobb: eax (upper half of rax zeroed), flags.
; Register names in the comments are the original author's labels.
; NOTE(review): confirm offsets and bit meanings against the PRM of the
; actual chipset generation.
; NOTE(review): nothing here waits for a pipe to report "off" before the
; following registers are touched; the original carried a commented-out
; monitor/mwait attempt at such a wait.
;-----------------------------------------------------------------------

; gfx_mmio_rmw OP, OFFSET, MASK
;   eax = [rdi+OFFSET]; eax = eax OP MASK; [rdi+OFFSET] = eax
%macro gfx_mmio_rmw 3
        mov     eax, dword [rdi+%2]
        %1      eax, %3
        mov     dword [rdi+%2], eax
%endmacro

align 8
disable_all_display_modes:
        ; sDVO ports: clear the "stall" bit
        gfx_mmio_rmw and, 0x61140, 11011111111111111111111111111111b
        gfx_mmio_rmw and, 0x61160, 11011111111111111111111111111111b

        ; CURACNTR / CURBCNTR: cursor A and B off
        gfx_mmio_rmw and, 0x70080, 11111111111111111111111111011000b
        gfx_mmio_rmw and, 0x700c0, 11111111111111111111111111011000b

        ; DSPACNTR / DSPBCNTR: plane A and B off
        gfx_mmio_rmw and, 0x70180, 01111111111111111111111111111111b
        gfx_mmio_rmw and, 0x71180, 01111111111111111111111111111111b

        ; PIPEACONF: first disable all planes and cursors, then the pipe
        ; itself (the register is read again between the two writes)
        gfx_mmio_rmw or,  0x70008, 00000000000011000000000000000000b
        gfx_mmio_rmw and, 0x70008, 01111111111111111111111111111111b

        ; PIPEBCONF: same two steps for pipe B
        gfx_mmio_rmw or,  0x71008, 00000000000011000000000000000000b
        gfx_mmio_rmw and, 0x71008, 01111111111111111111111111111111b

        ; Original comment: "Disable VGA display".
        ; NOTE(review): this CLEARS the top bit of 0x71400. In the Linux
        ; i915 register definitions that bit is the VGA display *disable*
        ; bit, so clearing it would enable the VGA plane - confirm.
        gfx_mmio_rmw and, 0x71400, 01111111111111111111111111111111b

        ; Panel fitter off
        gfx_mmio_rmw and, 0x68000, 11111111111111111111011111111111b

        ; DPLLA_CTRL / DPLLB_CTRL: DPLL VCO = 0 (disabled), VGA mode = 0 (enabled)
        gfx_mmio_rmw and, 0x6014,  01101111111111111111111111111111b
        gfx_mmio_rmw and, 0x6018,  01101111111111111111111111111111b
        ret
;-----------------------------------------------------------------------
; disable_all_display_modes_IO - I/O-space variant: each register is
; selected by writing its offset to port dx, then read-modify-written
; through port dx+4.
; In:    dx = I/O BAR (base port of the index/data register pair).
; Out:   dx unchanged on return.
; Clobb: eax (upper half of rax zeroed), flags.
; Register names in the comments are the original author's labels.
; NOTE(review): unlike disable_all_display_modes, this variant only sets
; the "planes and cursors off" bits in PIPExCONF and never clears the
; pipe bit itself - confirm whether that difference is intended.
; NOTE(review): no call to this routine is visible in this file.
;-----------------------------------------------------------------------

; gfx_io_rmw OP, OFFSET, MASK
;   out dx <- OFFSET; eax = in dx+4; eax = eax OP MASK; out dx+4 <- eax
%macro gfx_io_rmw 3
        mov     eax, %2
        out     dx, eax                 ; select the register
        add     dx, 4                   ; dx -> data port
        in      eax, dx
        %1      eax, %3
        out     dx, eax
        sub     dx, 4                   ; dx -> index port again
%endmacro

align 8
; dx - IOBAR
disable_all_display_modes_IO:
        ; sDVO ports: clear the "stall" bit
        gfx_io_rmw and, 0x61140, 11011111111111111111111111111111b
        gfx_io_rmw and, 0x61160, 11011111111111111111111111111111b

        ; CURACNTR / CURBCNTR: cursor A and B off
        gfx_io_rmw and, 0x70080, 11111111111111111111111111011000b
        gfx_io_rmw and, 0x700c0, 11111111111111111111111111011000b

        ; DSPACNTR / DSPBCNTR: plane A and B off
        gfx_io_rmw and, 0x70180, 01111111111111111111111111111111b
        gfx_io_rmw and, 0x71180, 01111111111111111111111111111111b

        ; PIPEACONF / PIPEBCONF: disable all planes and cursors
        gfx_io_rmw or,  0x70008, 00000000000011000000000000000000b
        gfx_io_rmw or,  0x71008, 00000000000011000000000000000000b

        ; Original comment: "Disable VGA display".
        ; NOTE(review): this clears the top bit of 0x71400; confirm the
        ; polarity of that bit against the PRM.
        gfx_io_rmw and, 0x71400, 01111111111111111111111111111111b

        ; Panel fitter off
        gfx_io_rmw and, 0x68000, 11111111111111111111011111111111b

        ; DPLLA_CTRL / DPLLB_CTRL: DPLL VCO = 0 (disabled), VGA mode = 0 (enabled)
        gfx_io_rmw and, 0x6014,  01101111111111111111111111111111b
        gfx_io_rmw and, 0x6018,  01101111111111111111111111111111b
        ret
;-----------------------------------------------------------------------
; set_regs - load a complete VGA register set from a table (used for
; mode 13h).
; In:    rsi -> 62 bytes, consumed in this order:
;          1  byte  -> port 0x3C2
;          1  byte  -> port 0x3DA
;          5  bytes -> index/data port 0x3C4, indices 0x00..0x04
;          25 bytes -> index/data port 0x3D4, indices 0x00..0x18
;          9  bytes -> index/data port 0x3CE, indices 0x00..0x08
;          21 bytes -> port 0x3C0,            indices 0x00..0x14
; Out:   rsi advanced past the table.
; Clobb: rax, rcx, rdx.
; Flags: interrupts are masked while the registers are programmed; on
;        return RFLAGS (including IF and DF) is restored to the caller's
;        value instead of unconditionally executing sti.
;-----------------------------------------------------------------------

; vga_load_indexed PORT, LAST
;   For index = 0..LAST: out word (value << 8 | index) to PORT,
;   taking each value byte from [rsi] and advancing rsi.
%macro vga_load_indexed 2
        xor     ecx, ecx                ; ecx = register index
        mov     edx, %1
%%next:
        movzx   eax, byte [rsi]
        inc     rsi
        shl     eax, 8                  ; ah = value
        or      eax, ecx                ; al = index
        out     dx, ax
        inc     ecx
        cmp     ecx, %2
        jbe     %%next
%endmacro

align 8
set_regs: ; Set VGA registers for the 13h mode
        pushfq                          ; remember the caller's IF/DF
        cli
        cld                             ; outsb must walk the table upwards

        mov     edx, 0x3C2
        outsb
        mov     edx, 0x3DA
        outsb

        vga_load_indexed 0x3C4, 4       ; 0x3C4 block (original label called it "CRTC")

        ; Write value 0x0E to index 0x11 of port 0x3D4 before loading the
        ; full block. NOTE(review): presumably lifts the CRTC write
        ; protection on indices 0..7 - confirm.
        mov     edx, 0x3D4
        mov     eax, 0x0E11
        out     dx, ax
        vga_load_indexed 0x3D4, 0x18

        vga_load_indexed 0x3CE, 8

        ; Port 0x3C0 takes index and data on the same port, alternating
        ; via an internal flip-flop. Reading 0x3DA puts the flip-flop in
        ; the "index" state; do it before every pair instead of issuing a
        ; 16-bit read on 0x3C0 as the original did.
        xor     ecx, ecx
.attr_next:
        mov     edx, 0x3DA
        in      al, dx                  ; flip-flop -> index
        mov     edx, 0x3C0
        mov     eax, ecx
        out     dx, al                  ; index
        outsb                           ; data
        inc     ecx
        cmp     ecx, 0x14
        jbe     .attr_next

        mov     edx, 0x3DA
        in      al, dx                  ; flip-flop -> index
        mov     edx, 0x3C0
        mov     al, 0x20
        out     dx, al                  ; final index write with bit 5 set, as in the original

        popfq                           ; restore the caller's interrupt state
        ret
align 8
;-----------------------------------------------------------------------
; set_palette256 - program all 256 VGA DAC palette entries.
; In:    rsi -> 768 bytes: 256 entries of 3 bytes each, in the order
;        they are written to port 0x3C9. The VGA DAC takes the three
;        bytes as red, green, blue (the original comments had green and
;        blue swapped).
; Out:   rsi advanced by 768.
; Clobb: rax, rdx, flags; DF is cleared.
;-----------------------------------------------------------------------
set_palette256: ; Set Vga palette
        cld
        xor     eax, eax                ; eax = palette index, 0..255
.next_entry:
        mov     edx, 0x03C8
        out     dx, al                  ; select the entry to write
        inc     edx                     ; port 0x3C9
        outsb                           ; red
        outsb                           ; green
        outsb                           ; blue
        inc     eax
        cmp     eax, 256
        jb      .next_entry             ; index is unsigned
        ret
times 2048 - ($-$$) db 0 ;alignment
textsize equ $-$$
; Data section, laid out directly after the padded .text section.
section .data follows=.text
dataStart:
; UTF-16 strings; not referenced by the code visible in this file.
tststr dw __utf16__(`test_\0`)
numretstr dw __utf16__(`0x0000000000000000\n\0`)
; Handover variables: values received from the firmware at entry and
; service pointers that start caches from the system table.
EFI_HANDLE dq 0 ; rcx at entry (image handle)
EFI_SYSTEM_TABLE dq 0 ; rdx at entry (system table pointer)
get_mmap_boot_srvc dq 0 ; loaded from [BootServices+56]
efi_print dq 0 ; loaded from [ConOut+8]; not called in the visible code
exit_boot_services dq 0 ; loaded from [BootServices+232]
; Not referenced by the visible code.
; NOTE(review): the field names suggest one memory-map descriptor - confirm.
memmap_UEFI:
type dd 0
phys_addr dq 0
virt_addr dq 0
num_pafes dq 0
attribute dq 0
; Arguments handed to the get-memory-map service by address.
mmap_sz dq 4096 ; 1st argument; initial value equals the size of the MMap buffer
mmdsz dq 48 ; 4th argument
mmkey dq 0 ; 3rd argument; its value is then passed to exit_boot_services
mmdsv dq 0 ; intended as the 5th argument
; Register dump consumed sequentially by set_regs (62 bytes). Rows here
; are 9 bytes wide and do NOT line up with the register groups:
;   byte  0       -> port 0x3C2
;   byte  1       -> port 0x3DA
;   bytes 2..6    -> port 0x3C4, indices 0x00..0x04
;   bytes 7..31   -> port 0x3D4, indices 0x00..0x18
;   bytes 32..40  -> port 0x3CE, indices 0x00..0x08
;   bytes 41..61  -> port 0x3C0, indices 0x00..0x14
VGA13h db 0x63, 0x00, 0x03, 0x01, 0x0F, 0x00, 0x0E, 0x5F, 0x4F
db 0x50, 0x82, 0x54, 0x80, 0xBF, 0x1F, 0x00, 0x41, 0x00
db 0x00, 0x00, 0x00, 0x00, 0x00, 0x9C, 0x0E, 0x8F, 0x28
db 0x40, 0x96, 0xB9, 0xA3, 0xFF, 0x00, 0x00, 0x00, 0x00
db 0x00, 0x40, 0x05, 0x0F, 0xFF, 0x00, 0x01, 0x02, 0x03
db 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C
db 0x0D, 0x0E, 0x0F, 0x41, 0x00, 0x0F, 0x00, 0x00
; Palette table consumed by set_palette256: three bytes per entry, sent
; to port 0x3C9 in table order. Rows here are 13 bytes wide and do not
; line up with entry boundaries.
palette256 db 00, 00, 00, 00, 10, 41, 12, 28, 18, 02, 43, 22, 35
db 19, 09, 58, 00, 00, 57, 35, 12, 43, 43, 47, 24, 24
db 28, 20, 24, 60, 10, 60, 15, 31, 47, 63, 62, 56, 20
db 60, 56, 22, 63, 61, 36, 63, 63, 63, 00, 00, 00, 05
db 05, 05, 08, 08, 08, 11, 11, 11, 14, 14, 14, 17, 17
db 17, 20, 20, 20, 24, 24, 24, 28, 28, 28, 32, 32, 32
db 36, 36, 36, 40, 40, 40, 45, 45, 45, 50, 50, 50, 56
db 56, 56, 63, 63, 63, 13, 12, 15, 15, 16, 22, 17, 20
db 29, 19, 24, 36, 21, 28, 43, 23, 31, 50, 25, 34, 57
db 26, 42, 63, 00, 15, 02, 01, 22, 04, 02, 29, 06, 03
db 36, 08, 04, 43, 10, 05, 50, 12, 06, 57, 14, 20, 63
db 40, 18, 06, 07, 25, 12, 11, 33, 17, 14, 40, 23, 18
db 48, 28, 21, 55, 34, 25, 62, 39, 27, 63, 48, 36, 15
db 03, 02, 22, 06, 04, 29, 09, 06, 36, 12, 08, 43, 15
db 10, 50, 18, 12, 57, 21, 14, 63, 28, 20, 15, 00, 00
db 22, 07, 00, 29, 15, 00, 36, 23, 00, 43, 31, 00, 50
db 39, 00, 57, 47, 00, 63, 55, 00, 15, 05, 03, 22, 11
db 07, 29, 17, 11, 36, 23, 15, 43, 29, 19, 50, 35, 23
db 57, 41, 27, 63, 53, 34, 28, 14, 12, 33, 20, 14, 38
db 26, 16, 43, 32, 18, 48, 38, 20, 53, 44, 22, 58, 50
db 24, 63, 56, 30, 05, 05, 06, 10, 10, 13, 15, 15, 20
db 20, 20, 27, 25, 25, 34, 30, 30, 41, 35, 35, 48, 44
db 44, 63, 03, 06, 05, 05, 11, 09, 07, 16, 13, 09, 21
db 17, 11, 26, 21, 13, 31, 25, 15, 36, 29, 20, 48, 38
db 06, 06, 07, 13, 13, 15, 20, 20, 23, 27, 27, 31, 34
db 34, 39, 41, 41, 47, 48, 48, 55, 57, 57, 63, 06, 15
db 04, 12, 22, 08, 18, 29, 12, 24, 36, 16, 30, 43, 20
db 36, 50, 24, 42, 57, 28, 54, 63, 35, 15, 10, 10, 22
db 16, 16, 29, 21, 21, 36, 27, 27, 43, 32, 32, 50, 38
db 38, 57, 43, 43, 63, 54, 54, 15, 15, 06, 22, 22, 12
db 29, 29, 18, 36, 36, 24, 43, 43, 30, 50, 50, 36, 57
db 57, 42, 63, 63, 54, 02, 04, 14, 06, 12, 21, 10, 20
db 28, 14, 28, 35, 18, 36, 42, 22, 44, 49, 26, 52, 56
db 36, 63, 63, 18, 04, 14, 24, 08, 21, 31, 12, 28, 37
db 16, 35, 44, 20, 42, 50, 24, 49, 57, 28, 56, 63, 38
db 63, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 53, 44, 22, 09
db 08, 12, 16, 14, 16, 22, 21, 20, 29, 27, 24, 35, 34
db 28, 42, 40, 32, 48, 47, 36, 57, 56, 43, 08, 12, 16
db 14, 16, 22, 21, 20, 29, 27, 24, 35, 34, 28, 42, 40
db 32, 48, 47, 36, 57, 56, 43, 63, 13, 09, 11, 21, 16
db 15, 27, 22, 18, 36, 29, 22, 42, 35, 25, 51, 42, 29
db 57, 48, 32, 63, 56, 39, 06, 14, 09, 12, 21, 14, 18
db 27, 22, 24, 33, 28, 30, 39, 36, 36, 46, 42, 42, 52
db 47, 50, 59, 53, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00
db 00
; Pad everything above in this section to 1024 bytes so MMap starts at a fixed offset.
times 1024 - ($-$$) db 0 ;alignment
; 4096-byte buffer passed as the 2nd argument of the get-memory-map call in start.
MMap:
times 4096 db 0
; Total size of this section; passed to jd9999_hdr_macro at the top of the file.
datasize equ $-$$
Photos of the computer screen after the program finished(from afar and a closeup with explanation):
As you can see, VGA 13h draws some weird stuff on the screen because the previous graphics mode was not disabled. So, why does my code not disable the current video mode of my Intel graphics card (given that the base address of the MMIO is correct)?
Related
Assembly minimum, middle, max, sum, and integer average of a list of numbers
I need to create a simple x96-64 assembly language program to compute the min, middle value, max, sum and integer average of a list of numbers. When I try to assemble it gives me errors. My code so far: ; ----- ; Define constants. NULL equ 0 ; end of string TRUE equ 1 FALSE equ 0 EXIT_SUCCESS equ 0 ; successful operation SYS_exit equ 60 ; call code for terminate ; ----- lst dd 4220, -1116, 1542, 1240, 1677 dd -1635, 2426, 1820, 1246, -333 dd 2315, -215, 2726, 1140, 2565 dd 2871, 1614, 2418, 2513, 1422 dd -119, 1215, -1525, -712, 1441 dd -3622, -731, -1729, 1615, 2724 dd 1217, -224, 1580, 1147, 2324 dd 1425, 1816, 1262, -2718, 1192 dd -1435, 235, 2764, -1615, 1310 dd 1765, 1954, -967, 1515, 1556 dd 1342, 7321, 1556, 2727, 1227 dd -1927, 1382, 1465, 3955, 1435 dd -225, -2419, -2534, -1345, 2467 dd 1615, 1961, 1335, 2856, 2553 dd -1035, 1835, 1464, 1915, -1810 dd 1465, 1554, -267, 1615, 1656 dd 2192, -825, 1925, 2312, 1725 dd -2517, 1498, -670, 1475, 2030 dd 1223, 1883, -1173, 1350, 2415 dd -335, 1125, 1118, 1713, 3020 length dd 100 lstMin dd 0 lstMid dd 0 lstMax dd 0 lstSum dd 0 lstAve dd 0 evenCnt dd 0 evenSum dd 0 evenAve dd 0 tenCnt dd 0 tenSum dd 0 tenAve dd 0 ; ***************************************************************** section .text global _start _start: ; ---------------------------------------------- mov rcx, 0 mov ecx, dword [length] mov eax, dword[lst] mov dword, [lst +lstMin] eax mov dword, [lst +lstMax] eax mov rsi, 0 mov dword [lst +lstSum],0 sumLp: mov eax, dword [lst+rsi] add dword [lst + lstSum],eax cmp eax, dword [lst+lstMin] jge minDone mov dword[lstMin],eax minDone: cmp eax,dword [lstMax] jle maxDone mov dword[lstMax],eax maxDone: add rsi, 4 dec rcx cmp rcx, 0 jne sumLp mov eax, dword [lstSum] cdq idiv dword [length] mov dword[lstAve],eax ; ***************************************************************** ; Done, terminate program. 
last: mov eax, SYS_exit ; call call for exit (SYS_exit) mov ebx, EXIT_SUCCESS ; return code of 0 (no error) syscall The errors I get when assembling with YASM are similar to: myprog.asm:60: error: unexpected `,' after instruction myprog.asm:61: error: unexpected `,' after instruction Why am I getting these errors and how can I fix them?
all strings are printing on the console at the same time in assembly language 8086
I am having a problem with my assembly code I was making a linear search program and in that program i have initializes two strings; the string when user is asked to input a number and other when there is a positive result but the problem is that when i run my program, all strings are printing at the same time on the console. note: there are some other errors also but please ignore them at this time. include irvine32.inc .data searchArr WORD 1, 4, 7, 14, 299, 156, 3, 63, 29, 300, 20 user_input word ? search_str byte "enter the number you want to search",0ah,0dh yes_str byte "the number is in given array",0ah,0dh .code main proc mov edx,offset search_str call writestring mov edx,0 ; asking the number from user call readint mov user_input , ax mov eax,0 ;seacrhing the array mov ecx, (lengthof searchArr) mov esi,0 search: mov bx,searchArr[esi * type searchArr] cmp bx,user_input je yes inc esi loop search yes: mov edx,0 mov edx,offset yes_str call writestring exit main endp end main
Assembly: how to convert number into ascii and write to display buffer
I am new to assembly and am programming in linux 64 bit in AT&T syntax. If I store the number 1 in a register, how can I translate that to the ascii character "A"? For example: movl $1, %ebx addl $64, %ebx Can I add 64 to 1 to make 65 (the decimal value of A), then somehow convert it to "A" and send this to the buffer using write system call? EDIT 1: Posting my program code here. .section .data message: .long 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 length: .long 10 .section .text .globl _start _start: xorq %rdi, %rdi xorq %rax, %rax xorq %rbx, %rbx xorq %rcx, %rcx xorq %rdx, %rdx movl length, %edx loop: cmpl %ecx, %edx je loop_end movl message(,%rdi,4), %eax addl $64, %eax pushq %rax incq %rdi incq %rcx jmp loop loop_end: cmpq $0, %rcx je exit popq %rbx pushq %rcx movq $1, %rax movq $1, %rdi movq %rbx, %rsi movl length, %edx syscall popq %rcx decq %rcx jmp loop_end exit: movq $60, %rax movq $0, %rdi syscall
I'm not entirely familiar with AT&T syntax, but the disassembly of NASM in what you're accustomed to should suffice. You should try to avoid what is called hard coding constants as it makes your program harder to maintain, especially when it's hundreds if not thousands of lines in length. Therefore; section .data Values: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 18, 12, 20, 19, 11 V_Size equ $ - Values is preferable to this message: .long 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 length: .long 10 What you did is not wrong, but the method is predicated upon you counting, not the assembler. As it has already been pointed out, use the smallest data size required to get the job done. In this case char is better than long This code in NASM section .text global _start _start: xor ecx, ecx push rcx ; Applications default return value mov cl, V_Size push rcx mov ebx, Values push rbx Next: or byte [ebx], 64 inc ebx loop Next pop rsi pop rdx pop rax inc al mov edi, eax syscall mov edi, eax dec edi mov eax, edi mov al, 60 syscall section .data Values: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 18, 12, 20, 19, 11 V_Size equ $ - Values will yield ABCDEFGHIJZRLTSK with command prompt immediatly after "K". section .data: 6000d8 01020304 05060708 090a1a12 0c14130b section .text: <_start>: These two instructions are idiosyncratic to my style of programming and not essential to functionality of program. 4000b0: 31 c9 xor %ecx,%ecx 4000b2: 51 push %rcx Setup RCX & RBX for LOOP instruction 4000b3: b1 10 mov $0x10,%cl 4000b5: 51 push %rcx ARG2 to syscall 4000b6: bb d8 00 60 00 mov $0x6000d8,%ebx 4000bb: 53 push %rbx ARG1 to syscall <Next>: This conforms to the scope of your objective. 
4000bc: 67 80 0b 40 orb $0x40,(%ebx) [ebx] += 'A' 4000c0: ff c3 inc %ebx 4000c2: e2 f8 loop 4000bc <Next> ssize_t write (int fd, const void *buf, size_t count); 4000c4: 5e pop %rsi ARG1 = ASCII Pntr 4000c5: 5a pop %rdx ARG2 = # of chars 4000c6: 58 pop %rax 4000c7: fe c0 inc %al SYS_WRITE 4000c9: 89 c7 mov %eax,%edi ARG0 = STD_OUT 4000cb: 0f 05 syscall Epilogue: Again, just a method I use. 4000cd: 89 c7 mov %eax,%edi 4000cf: ff cf dec %edi 4000d1: 89 f8 mov %edi,%eax 4000d3: b0 3c mov $0x3c,%al 4000d5: 0f 05 syscall
"The value of ESP was not properly saved across a function call." even with LEAVE
When attempting to call a method which is defined in assembly, I'm receiving the error "The value of ESP was not properly saved across a function call.", using Visual Studio 2012. Looking at other questions, a common factor was the mention that the assembly may not have the LEAVE instruction at the end of each label. I am receiving this error, but with the following code, which does include the LEAVE instruction. section .bss vs: resb 13 ; 12-byte vendor string + NULL char ns: resb 49 ; 48-byte proc. name + NULL char section .text global _meta_vendor global _meta_procname _meta_vendor: push ebp mov ebp, esp mov eax, 0h cpuid mov [vs], ebx mov [vs + 4], edx mov [vs + 8], ecx mov byte [vs + 12], 0h mov eax, vs leave ret _meta_procname: push ebp mov ebp, esp mov eax, 80000002h cpuid mov [ns], eax mov [ns + 4], ebx mov [ns + 8], ecx mov [ns + 12], edx mov eax, 80000003h cpuid mov [ns + 16], eax mov [ns + 20], ebx mov [ns + 24], ecx mov [ns + 28], edx mov eax, 80000004h cpuid mov [ns + 32], eax mov [ns + 36], ebx mov [ns + 40], ecx mov [ns + 44], edx mov byte [ns + 48], 0h mov eax, ns leave ret Function prototypes for those labels are then in a header file which contains only this: #include <cstdint> extern "C" { char* meta_vendor(); char* meta_procname(); } Any insight as to why I am receiving this error? Note that, if I click "Continue" on the popup which appears, the expected values do appear.
The problem was solved by adding push ebx and pop ebx as is shown below: _meta_vendor: push ebp mov ebp, esp push ebx ; code... pop ebx leave ret
Faster assembly optimized way to convert between 8-bit grayscale and RGB32 image with SSE
I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion. Source is an 8 bits grey image, Destination should be an 32 bits grey image (BGRA) with 4th channel (alpha) to be ignored. Source address is not guaranteed to be 16 byte aligned, Count is a multiple of 16, Destination address is 16 byte aligned. INPUT: 8 bits single channel grey image OUTPUT: 32 bits BGRA (alpha channel ignored) COUNT: Image size is a multiple of 16 CPU: x86-32 (SSE2/SSE3 allowed) Here is my optimized assembly code. Is there an even faster way of conversion? void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) { static unsigned int __declspec(align(64)) Masks[] = { 0x80000000, 0x80010101, 0x80020202, 0x80030303, 0x80040404, 0x80050505, 0x80060606, 0x80070707, 0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b, 0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f }; __asm { mov esi, Source mov edi, Destination mov edx, Count xor ecx, ecx movdqa xmm4, xmmword ptr [Masks + 0] movdqa xmm5, xmmword ptr [Masks + 16] movdqa xmm6, xmmword ptr [Masks + 32] movdqa xmm7, xmmword ptr [Masks + 48] l1: movdqu xmm0, xmmword ptr [esi + ecx] movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 pshufb xmm0, xmm4 pshufb xmm1, xmm5 pshufb xmm2, xmm6 pshufb xmm3, xmm7 movntdq [edi + 0], xmm0 movntdq [edi + 16], xmm1 movntdq [edi + 32], xmm2 movntdq [edi + 48], xmm3 add edi, 64 add ecx, 16 cmp ecx, edx jb l1 } } There is another approach using several PUNPCKLBW and PUNPCKHBW but that seems to be slightly slower. Update: This is the basic non optimized algorithm: BGRA* Destination = ... unsigned char* Source ... for (unsigned int i = 0; i < Size; i++) { Destination[i].Blue = Source[i]; Destination[i].Green = Source[i]; Destination[i].Red = Source[i]; } PS: I also tried using C code with MS VS2008 SSE compiler intrinsics. 
It turned out that the compiler generated a lot of unnecessary memory moves which causes the code to be 10-20% slower than pure assembly. Update 2: This is the same code by using intrinsics only. void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) { static const unsigned int __declspec(align(64)) Masks[] = { 0x80000000, 0x80010101, 0x80020202, 0x80030303, 0x80040404, 0x80050505, 0x80060606, 0x80070707, 0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b, 0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f }; register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0)); register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4)); register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8)); register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12)); for (unsigned int i = 0; i < Count / 16; i++) { __m128i r0 = _mm_load_si128(Source + i); _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0)); _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1)); _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2)); _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3)); } } Update 3: This is the compiler generated code (beautified) (Visual Studio 2012, all optimization on): push ebp mov ebp, esp mov edx, dword ptr [ebp+8] movdqa xmm1, xmmword ptr ds:[Masks + 0] movdqa xmm2, xmmword ptr ds:[Masks + 16] movdqa xmm3, xmmword ptr ds:[Masks + 32] movdqa xmm4, xmmword ptr ds:[Masks + 48] push esi test ecx, ecx je l2 lea esi, [ecx-1] shr esi, 4 inc esi l1: mov ecx, edx movdqu xmm0, xmmword ptr [ecx] mov ecx, eax movdqa xmm5, xmm0 pshufb xmm5, xmm1 movdqa xmmword ptr [ecx], xmm5 movdqa xmm5, xmm0 pshufb xmm5, xmm2 movdqa xmmword ptr [eax+10h], xmm5 movdqa xmm5, xmm0 pshufb xmm5, xmm3 movdqa xmmword ptr [eax+20h], xmm5 lea ecx, [eax+30h] add edx, 10h add eax, 40h dec esi pshufb xmm0, xmm4 movdqa xmmword ptr [ecx], xmm0 jne l1 l2: pop esi pop ebp ret It seems that interleaving movdqa with pshufb 
is some what faster. Update 4: This seems to be the optimal hand optimized code: __asm { mov esi, Source mov edi, Destination mov ecx, Count movdqu xmm0, xmmword ptr [esi] movdqa xmm4, xmmword ptr [Masks + 0] movdqa xmm5, xmmword ptr [Masks + 16] movdqa xmm6, xmmword ptr [Masks + 32] movdqa xmm7, xmmword ptr [Masks + 48] l1: dec ecx lea edi, [ edi + 64 ] lea esi, [ esi + 16 ] movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 pshufb xmm0, xmm4 movdqa [edi - 64], xmm0 pshufb xmm1, xmm5 movdqa [edi - 48], xmm1 pshufb xmm2, xmm6 movdqa [edi - 32], xmm2 pshufb xmm3, xmm7 movdqa [edi - 16], xmm3 movdqu xmm0, xmmword ptr [esi] ja l1 } Update 5: This conversion algorithm uses punpck instruction. However this conversion routine is a bit slower than using masks and pushfb. for (unsigned int i = 0; i < Count; i += 16) { register __m128i r0 = _mm_load_si128(Source++); register __m128i r1 = _mm_unpackhi_epi8(r0, r0); register __m128i r2 = _mm_unpacklo_epi8(r0, r0); register __m128i r3 = _mm_unpackhi_epi8(r1, r1); register __m128i r4 = _mm_unpacklo_epi8(r1, r1); register __m128i r5 = _mm_unpackhi_epi8(r2, r2); register __m128i r6 = _mm_unpacklo_epi8(r2, r2); _mm_store_si128(Destination++, r6); _mm_store_si128(Destination++, r5); _mm_store_si128(Destination++, r4); _mm_store_si128(Destination++, r3); } Update 6: For the sake of completeness this is the inverse method to convert from 32 bits back to 8 bits grey image. 
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) { static const unsigned char __declspec(align(64)) Masks[] = { 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, }; register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0)); register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16)); register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32)); register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48)); for (unsigned int i = 0; i < Count / 64; i++) { __m128i a = _mm_load_si128(Source + (i * 4) + 0); __m128i b = _mm_load_si128(Source + (i * 4) + 1); __m128i c = _mm_load_si128(Source + (i * 4) + 2); __m128i d = _mm_load_si128(Source + (i * 4) + 3); a = _mm_shuffle_epi8(a, m0); b = _mm_shuffle_epi8(b, m1); c = _mm_shuffle_epi8(c, m2); d = _mm_shuffle_epi8(d, m3); __m128i e = _mm_or_si128(a, b); __m128i f = _mm_or_si128(c, d); __m128i g = _mm_or_si128(e, f); _mm_stream_si128(Destination + i, g); } }
Would try: __asm { mov esi, Source mov edi, Destination mov ecx, Count movdqu xmm0, xmmword ptr [esi] movdqa xmm4, xmmword ptr [Masks + 0] movdqa xmm5, xmmword ptr [Masks + 16] movdqa xmm6, xmmword ptr [Masks + 32] movdqa xmm7, xmmword ptr [Masks + 48] l1: dec ecx // modern Intel can macro-fuse this with jnz if adjacent lea edi, [ edi + 64 ] lea esi, [ esi + 16 ] movdqa xmm1, xmm0 movdqa xmm2, xmm0 movdqa xmm3, xmm0 pshufb xmm0, xmm4 pshufb xmm1, xmm5 pshufb xmm2, xmm6 pshufb xmm3, xmm7 movntdq [edi - 64], xmm0 movntdq [edi - 48], xmm1 movntdq [edi - 32], xmm2 movntdq [edi - 16], xmm3 movdqu xmm0, xmmword ptr [esi] jnz l1 } Haven't benchmarked it though; assumptions behind these changes: the movdqu xmm0,... latency can be a little more hidden within the loop (your code has the load of xmm0 followed directly by an instruction using the value in that register) the add ops on two regs as well as the cmp aren't really all necessary; address generation (lea) and the implicit zero test by dec/jnz can be used. That way, there'll be no EFLAGS dependencies caused by operations on ecx/esi/edi as the only ALU op in the loop is decrementing the loop counter. In the end, this is likely load/store bound in any case so the arithmetics are "free game"; I therefore expect little difference, even with the arguments as given. If the input is large, then it'd make sense to strip the "unaligned head/tail" off, i.e. to do a duff's device for the first/last [0..15] bytes, and the main loop using movdqa. 
Edit: Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly: Disassembly of section .text: 0000000000000000 <ConvertGreyToRgb32Assembler>: 0: 85 d2 test edx,edx 2: 74 76 je 7a <ConvertGreyToRgb32Assembler+0x7a> 4: 66 0f 6f 2d 00 00 00 00 movdqa xmm5,XMMWORD PTR [rip+0x0] # c <ConvertGreyToRgb32Assembler+0xc> c: 48 89 f8 mov rax,rdi f: 66 0f 6f 25 00 00 00 00 movdqa xmm4,XMMWORD PTR [rip+0x0] # 17 <ConvertGreyToRgb32Assembler+0x17> 17: 66 0f 6f 1d 00 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0] # 1f <ConvertGreyToRgb32Assembler+0x1f> 1f: 66 0f 6f 15 00 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] # 27 <ConvertGreyToRgb32Assembler+0x27> 27: 66 0f 1f 84 00 00 00 00 00 nop WORD PTR [rax+rax*1+0x0] 30: f3 0f 6f 00 movdqu xmm0,XMMWORD PTR [rax] 34: 48 89 f1 mov rcx,rsi 37: 48 83 c0 10 add rax,0x10 3b: 66 0f 6f c8 movdqa xmm1,xmm0 3f: 66 0f 38 00 cd pshufb xmm1,xmm5 44: 66 0f e7 0e movntdq XMMWORD PTR [rsi],xmm1 48: 66 0f 6f c8 movdqa xmm1,xmm0 4c: 66 0f 38 00 cc pshufb xmm1,xmm4 51: 66 0f e7 4e 10 movntdq XMMWORD PTR [rsi+0x10],xmm1 56: 66 0f 6f c8 movdqa xmm1,xmm0 5a: 66 0f 38 00 c2 pshufb xmm0,xmm2 5f: 66 0f 38 00 cb pshufb xmm1,xmm3 64: 66 0f e7 4e 20 movntdq XMMWORD PTR [rsi+0x20],xmm1 69: 66 0f e7 41 30 movntdq XMMWORD PTR [rcx+0x30],xmm0 6e: 89 c1 mov ecx,eax 70: 29 f9 sub ecx,edi 72: 48 83 c6 40 add rsi,0x40 76: 39 ca cmp edx,ecx 78: 77 b6 ja 30 <ConvertGreyToRgb32Assembler+0x30> 7a: f3 c3 repz ret This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.