VC inline asm - shifting with rcl (rcr) - visual-c++

I'm using the following function to shift the data of a vector by 1 bit (mult by 2):
// Shifts the multi-word value held in n left by one bit (multiply by 2).
// n is taken by value, grown by one zero word to receive the overflow bit,
// processed in place by the inline assembly below, and returned.
vec shl(vec n) {
n.resize(n.size() + 1, 0);
unsigned int* adr = n.data();
unsigned int s = n.size();
_asm {
// start with the carry flag cleared
clc
// ecx = word index, edx = address of the first word
mov ecx, 0
mov edx, dword ptr [adr]
nloop:
mov eax, dword ptr[edx + ecx * 4]
// rotate left through carry: the old CF enters bit 0, bit 31 leaves into CF
rcl eax, 1
mov dword ptr [edx + ecx * 4], eax
// inc does not modify CF
inc ecx
jc carryisset ; check carry - breakpoint
jmp nocarry ; ~ breakpoint
carryisset : ; ~ breakpoint
jmp nocarry ; ~ breakpoint
nocarry: ; ~ breakpoint
// NOTE(review): cmp rewrites CF (it is set whenever ecx is below s as an unsigned
// value), so the carry left by rcl above is not the one the next rcl consumes.
cmp ecx, dword ptr [s]
jl nloop
};
return n;
};
So, I've read that rcl rotates through the carry flag: the old carry is shifted into the low bit and the high bit is shifted out into the carry. But even when the debugger shows that the carry bit is not set, rcl still adds it to eax.
For example:
#include <iostream>
#include <vector>
typedef std::vector<unsigned int> vec;
const unsigned int uint_max = (unsigned int)(~0);
vec n1 = { uint_max, 2, 2, 1, 0, 0, 0 };
vec n2
int main() {
n2 = shl(n1);
for (auto i : n2)
std::cout << i << " ";
return 0;
};
Output:
4294967294 5 5 3 1 1 1 1
Stepping through the code with debugger:
loop: first iteration (ecx = 0)
eax <- uint_max
eax <- rotate left with carry (rcl)
now eax is uint_max - 1
jumps to carryisset (with jc), so there is a carry
loop: second iteration (ecx = 1)
eax <- 2
eax <- rotate left with carry (rcl)
now eax is (2 << 1) + 1 (carry) = 5
jumps to nocarry (with jc), so there is no carry
loop: third iteration (ecx = 2)
eax <- 2
eax <- rotate left with carry (rcl)
now eax is (2 << 1) + carry (which should be 0, since it is not set), but eax is 5 again, as if there were a carry.
jumps to nocarry (with jc), so there is no carry (at least according to jc)
...etc.
So, there is no carry after the first iteration in this case, but the carry does not 'reset'.
This implementation came from an SO post Large binary shifts in 8086 assembly? (accepted answer):
First, make sure the carry flag is zero. Then:
1. Pull 4 bytes into a register
2. RCR - in my case RCL
3. Write back out
4. Repeat with the next 4 bytes
However, the carry bit is always on when I rotate left (I also tried rotating right, with the same result: vec(2,0,0,0,0...) becomes vec(1, uint_max/2 + 1, uint_max/2 + 1, ...)).
PS: I made a working shift that avoids the carry flag and checks the highest bit instead, but I think it is overcomplicated:
// Shift variant that passes bit 31 from one word to the next in ebx
// instead of relying on the carry flag.
_asm {
clc
// edx = address of the first word
mov edx, dword ptr [adr]
// ebx = bit carried in from the previous word (none yet), ecx = word index
xor ebx, ebx
xor ecx, ecx
xor eax, eax
nloop:
mov eax, dword ptr[edx + ecx * 4]
// free edx so it can hold the incoming bit
push edx
mov edx, ebx
// ebx = top bit of the current word as 0 or 1, to be added to the next word
mov ebx, eax
and ebx, 0x80000000
shr ebx, 31
shl eax, 1
// place the bit from the previous word in bit 0
add eax, edx
// restore the base address
pop edx
mov dword ptr [edx + ecx * 4], eax
inc ecx
xor eax, eax
cmp ecx, dword ptr [s]
jl nloop
};
What is the problem with the first code, how to use rcl and rcr for shifting?

(Thanks to Hans. See the comment.)
The working code:
// Shift loop that keeps the rcl carry chain intact by saving the flags
// around the loop-termination compare.
// no carry into the first word
clc
// ecx = word index, edx = address of the first word
mov ecx, 0
mov edx, dword ptr[adr]
nloop:
// save the flags (including CF from the previous rcl) before cmp changes them
pushf
cmp ecx, dword ptr [s]
// index reached s: leave the loop; the saved flags are popped at fin
je fin
// restore CF for the rcl below
popf
mov eax, dword ptr[edx + ecx * 4]
// old CF enters bit 0, bit 31 leaves into CF
rcl eax, 1
mov dword ptr[edx + ecx * 4], eax
// inc does not modify CF
inc ecx
jmp nloop
fin:
// balance the pushf taken on the exit path
popf
I clear the carry flag first. In the main loop, I pushf the flags just for the cmp and popf them afterwards. For this, I moved the compare to the beginning of the loop. At fin, I popf the flags once more after the jump to avoid ESP errors.

Related

How to find characters in a string Assembly x86?

I'm trying to rewrite the C code below in Assembly x86
/* Returns the index of the first occurrence of c within the first size
 * bytes of v, or -1 when c does not occur in that range. */
int myFn( char * v, char c, int size ) {
    int pos = 0;
    while (pos < size) {
        if (v[pos] == c) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
I've tried to use this code in x86:
; myFn: scans a byte string for a character and leaves the result in eax.
; Stack arguments as this code uses them:
;   [esp + 4]  = loop bound (number of bytes to scan)
;   [esp + 8]  = value to search for
;   [esp + 12] = base address of the string
; esi, ebx and ecx are overwritten without being saved.
myFn:
mov esi, 0                      ; esi = current index
mov ebx, [esp + 8]              ; ebx = value to search for
mov ecx, [esp + 12]             ; ecx = base address of the string
FOR:
mov eax, -1                     ; result returned if the bound is reached
cmp esi, [esp + 4]
jge ENDFOR                      ; index >= bound: leave with eax = -1
; NOTE(review): the operand size comes from ebx, so this compares 4 bytes of
; the string against all of ebx rather than one byte against one character.
cmp [ecx + esi], ebx
je EQUAL
inc esi
jmp FOR
EQUAL:
; NOTE(review): this loads a dword from the address held in esi; esi is the
; index, not a pointer.
mov eax, [esi]
ENDFOR:
ret
I've also created this program to test the function:
; Test harness: calls myFn on the string "test" and prints the value it
; returns with printf.
section .data
fmt: db "strfind: %d", 10, 0    ; printf format, newline, NUL terminator
str: db "test", 0               ; string to search, NUL terminated
section .text
global main
extern printf
main:
mov eax, 's'                    ; character to look for
mov ebx, 4                      ; number of bytes to scan
; Push order is str, 's', 4, so at entry to myFn (after the return address
; is pushed) [esp + 4] = 4, [esp + 8] = 's' and [esp + 12] = str.
push str
push eax
push ebx
call myFn
add esp, 12                     ; remove the three arguments
push eax                        ; value returned by myFn
push fmt
call printf
add esp, 8                      ; remove the two printf arguments
ret
; myFn: scans a byte string for a character and leaves the result in eax.
; Stack arguments as this code uses them:
;   [esp + 4]  = loop bound (number of bytes to scan)
;   [esp + 8]  = value to search for
;   [esp + 12] = base address of the string
; esi, ebx and ecx are overwritten without being saved.
myFn:
mov esi, 0                      ; esi = current index
mov ebx, [esp + 8]              ; ebx = value to search for
mov ecx, [esp + 12]             ; ecx = base address of the string
FOR:
mov eax, -1                     ; result returned if the bound is reached
cmp esi, [esp + 4]
jge ENDFOR                      ; index >= bound: leave with eax = -1
; NOTE(review): the operand size comes from ebx, so this compares 4 bytes of
; the string against all of ebx rather than one byte against one character.
cmp [ecx + esi], ebx
je EQUAL
inc esi
jmp FOR
EQUAL:
; NOTE(review): this loads a dword from the address held in esi; esi is the
; index, not a pointer.
mov eax, [esi]
ENDFOR:
ret
I'm getting Segmentation Fault error or the wrong result when trying to test it. I believe the problem is when comparing the character of the string with the character I want to find

Insertion sort not swapping

I have to implement an insertion sort algorithm in x86 and my code doesn't change the output of the array at all. I think that the problem lies where I am trying to swap in my inner loop but whenever I change how the array elements get assigned nothing happens. I get no change in anything that the program outputs. Why is this happening, and how can I fix it?
My code is:
// Insertion-sort attempt written as MSVC inline assembly, followed by a
// template loop ("more" ... "done") that stores eax to successive elements.
void asmSort(int *list, int arrayLen, int halfpoint) {
/*
* list = address of the integer array
* arrayLen = the number of elements in the list, just like list.length in Java
* halfpoint is used as a flag
* halfpoint = 1: return once the sort routine reaches the half point; otherwise finish the sort and return
*/
/*
*
*
insertion_sort(list,arrayLen,halfpoint);
return;
selection_sort(list,arrayLen,halfpoint);
return;
*
*
*/
// any variable can be declared here before _asm
/*
int tmp = 0;
int i = 0;
int j = 0;
*/
_asm
{
mov ecx, arrayLen
mov esi, list
mov ebx, halfpoint
mov eax, 99
push eax
push ebp
// ebp and eax hold byte offsets (4 bytes per int) and ecx is scaled to bytes to match
mov ebp, 4 //this is i
shl ecx, 2
outerLoop:
cmp ebp, ecx
jg exitOuter
add esi,ebp
mov edi,[esi]// temp = a[i]
mov eax, ebp //j = i
sub eax, 4 // j = j-1
innerLoop :
cmp eax, 0 //j>0
jle exitInner
add esi, eax // offset array to a[j]
mov edx, [esi] // move a[j] to edx
cmp edi, edx // temp < a[j]
jle exitInner
push eax
mov eax,[esi]
add esi,4
// NOTE(review): this overwrites the register esi with the value in edi;
// nothing is stored to memory here.
mov esi,edi
pop eax
sub eax,4 // j--
jmp innerLoop
exitInner:
// NOTE(review): this runs on every pass through exitInner, so ecx is
// halved again on each outer iteration.
shr ecx, 1
cmp ebp, ecx
je exitOuter
sub esi,ebp
add ebp, 4//i++
jmp outerLoop
exitOuter :
sub esi, ebp
pop ebp
pop eax
; .......
more: cmp ecx,0
jle done
;.........
mov edx,arrayLen
sar edx,1
cmp ecx,edx
jg cont1
cmp halfpoint,1
je done
cont1: ;.....
;......
;.......
;.....
mov [esi],eax
add esi,4
dec ecx
jmp more
done:
}
return;
}
You never write to memory. The problem is here:
mov eax,[esi]
add esi,4
mov esi,edi
You want to write to memory at ESI, not to register ESI.
mov eax,[esi]
add esi,4
mov [esi],edi

(VC++) Runtime Check for Uninitialized Variables: How is the test Implemented?

I'm trying to understand what this test does exactly. This toy code
/* Toy program: i has no initializer and is passed to printf before any assignment. */
int _tmain(int argc, _TCHAR* argv[])
{
int i;
printf("%d", i);
return 0;
}
Compiles into this:
int _tmain(int argc, _TCHAR* argv[])
{
012C2DF0 push ebp
012C2DF1 mov ebp,esp
012C2DF3 sub esp,0D8h
012C2DF9 push ebx
012C2DFA push esi
012C2DFB push edi
012C2DFC lea edi,[ebp-0D8h]
012C2E02 mov ecx,36h
012C2E07 mov eax,0CCCCCCCCh
012C2E0C rep stos dword ptr es:[edi]
012C2E0E mov byte ptr [ebp-0D1h],0
int i;
printf("%d", i);
012C2E15 cmp byte ptr [ebp-0D1h],0
012C2E1C jne wmain+3Bh (012C2E2Bh)
012C2E1E push 12C2E5Ch
012C2E23 call __RTC_UninitUse (012C10B9h)
012C2E28 add esp,4
012C2E2B mov esi,esp
012C2E2D mov eax,dword ptr [i]
012C2E30 push eax
012C2E31 push 12C5858h
012C2E36 call dword ptr ds:[12C9114h]
012C2E3C add esp,8
012C2E3F cmp esi,esp
012C2E41 call __RTC_CheckEsp (012C1140h)
return 0;
012C2E46 xor eax,eax
}
012C2E48 pop edi
012C2E49 pop esi
012C2E4A pop ebx
012C2E4B add esp,0D8h
012C2E51 cmp ebp,esp
012C2E53 call __RTC_CheckEsp (012C1140h)
012C2E58 mov esp,ebp
012C2E5A pop ebp
012C2E5B ret
The 5 lines emphasized are the only ones removed by properly initializing the variable i. The lines 'push 12C2E5Ch, call __RTC_UninitUse' call the function that displays the error box, with a pointer to a string containing the variable name ("i") as an argument.
What I can't understand are the 3 lines that perform the actual test:
012C2E0E mov byte ptr [ebp-0D1h],0
012C2E15 cmp byte ptr [ebp-0D1h],0
012C2E1C jne wmain+3Bh (012C2E2Bh)
It would have seemed the compiler is probing the stack area of i (setting a byte to zero and immediately testing whether it's zero), just to be sure it isn't initialized somewhere it couldn't see during build. However, the probed address, ebp-0D1h, has little to do with the actual address of i.
Even worse, it seems if there were such an external (other thread?) initialization that did initialize the probed address but to zero, this test would still shout about the variable being uninitialized.
What's going on? Maybe the probe is meant for something entirely different, say to test if a certain byte is writable?
[ebp-0D1h] is a temporary variable used by the compiler to track "initialized" status of variables. If we modify the source a bit, it will be more clear:
/* Toy program with two locals: both are read by the first printf before any
 * assignment, i is assigned before the second printf, and j only after it. */
int _tmain(int argc, _TCHAR* argv[])
{
int i, j;
printf("%d %d", i, j);
i = 1;
printf("%d %d", i, j);
j = 2;
return 0;
}
Produces the following (irrelevant parts skipped):
mov DWORD PTR [ebp-12], -858993460 ; ccccccccH
mov DWORD PTR [ebp-8], -858993460 ; ccccccccH
mov DWORD PTR [ebp-4], -858993460 ; ccccccccH
mov BYTE PTR $T4694[ebp], 0
mov BYTE PTR $T4693[ebp], 0
In prolog, variables are filled with 0xCC, and two tracking variables (one for i and one for j) are set to 0.
; 7 : printf("%d %d", i, j);
cmp BYTE PTR $T4693[ebp], 0
jne SHORT $LN3#main
push OFFSET $LN4#main
call __RTC_UninitUse
add esp, 4
$LN3#main:
cmp BYTE PTR $T4694[ebp], 0
jne SHORT $LN5#main
push OFFSET $LN6#main
call __RTC_UninitUse
add esp, 4
$LN5#main:
mov eax, DWORD PTR _j$[ebp]
push eax
mov ecx, DWORD PTR _i$[ebp]
push ecx
push OFFSET $SG4678
call _printf
add esp, 12 ; 0000000cH
This corresponds roughly to:
// Pseudo-code for the generated checks. $T4693 is the tracking byte for j and
// $T4694 the one for i (each is set to 1 next to the matching assignment).
// j is tested first, matching the order of the compare instructions.
if ( $T4693 == 0 )
_RTC_UninitUse("j");
if ( $T4694 == 0 )
_RTC_UninitUse("i");
printf("%d %d", i, j);
Next part:
; 8 : i = 1;
mov BYTE PTR $T4694[ebp], 1
mov DWORD PTR _i$[ebp], 1
So, once i is initialized, the tracking variable is set to 1.
; 10 : j = 2;
mov BYTE PTR $T4693[ebp], 1
mov DWORD PTR _j$[ebp], 2
Here, the same is happening for j.
Here is my guess: the compiler probably allocates flags in memory showing the initialization status of variables. In your case for variable i this is a single byte at [ebp-0D1h]. The zeroing of this byte means i is not initialized. I assume if you initialize i this byte will be set to non-zero. Try something run-time like this: if (argc > 1) i = 1; This should generate code instead of omitting the whole check. You can also add another variable, and see if you get two different flags.
The zeroing of the flag and the testing just happen to be consecutive in this case, but that might not always be the case.
C7060F000055 mov dword ptr [esi],5500000Fh
C746048BEC5151 mov dword ptr [esi+0004],5151EC8Bh
b. And one of its later generations:
BF0F000055 mov edi,5500000Fh
893E mov [esi],edi
5F pop edi
52 push edx
B640 mov dh,40
BA8BEC5151 mov edx,5151EC8Bh
53 push ebx
8BDA mov ebx,edx
895E04 mov [esi+0004],ebx
c. And yet another generation with recalculated ("encrypted") "constant" data:
BB0F000055 mov ebx,5500000Fh
891E mov [esi],ebx
5B pop ebx
51 push ecx
B9CB00C05F mov ecx,5FC000CBh
81C1C0EB91F1 add ecx,F191EBC0h ; ecx=5151EC8Bh

IDA PRO Struct Pointer Counter big number not starting from address offset 0, Lowers a bit slightly but not completely to 0

I put the whole question in 3 images. From my research it seems I need to use CTRL+R, but I don't think that's what I need: I could lower the number a bit, but I can't lower it all the way to the proper value of 0.
I think the problem is I'm not creating the structs properly probably missing something.
ASM Code:
.text:0040E040 ; =============== S U B R O U T I N E =======================================
.text:0040E040
.text:0040E040
.text:0040E040 ; struct_ARENA *__thiscall code(struct_PLAYER *player, const void *buf, unsigned int len, int a4)
.text:0040E040 sub_40E040 proc near
.text:0040E040
.text:0040E040
.text:0040E040 buf = dword ptr 4
.text:0040E040 len = dword ptr 8
.text:0040E040 a4 = dword ptr 0Ch
.text:0040E040
.text:0040E040 push ebx
.text:0040E041 push esi
.text:0040E042 mov esi, ecx
.text:0040E044 mov eax, [esi+1Ch]
.text:0040E047 test eax, eax
.text:0040E049 jz short loc_40E093
.text:0040E04B mov ecx, [eax+0FF0Ch]
.text:0040E051 xor ebx, ebx
.text:0040E053 test ecx, ecx
.text:0040E055 jle short loc_40E093
.text:0040E057 push edi
.text:0040E058 push ebp
.text:0040E059 mov ebp, [esp+10h+a4]
.text:0040E05D mov edi, 0FB20h
.text:0040E062
.text:0040E062 loc_40E062:
.text:0040E062 mov eax, [edi+eax]
.text:0040E065 cmp eax, esi
.text:0040E067 jz short loc_40E082
.text:0040E069 mov ecx, [eax+38h]
.text:0040E06C test ecx, ecx
.text:0040E06E jnz short loc_40E082
.text:0040E070 mov ecx, [esp+10h+len]
.text:0040E074 mov edx, [esp+10h+buf]
.text:0040E078 push ebp ; a4
.text:0040E079 push ecx ; len
.text:0040E07A push edx ; buf
.text:0040E07B mov ecx, eax ; this
.text:0040E07D call SendPlayerReliablePacket
.text:0040E082
.text:0040E082 loc_40E082:
.text:0040E082
.text:0040E082 mov eax, [esi+1Ch]
.text:0040E085 inc ebx
.text:0040E086 add edi, 4
.text:0040E089 cmp ebx, [eax+0FF0Ch]
.text:0040E08F jl short loc_40E062
.text:0040E091 pop ebp
.text:0040E092 pop edi
.text:0040E093
.text:0040E093 loc_40E093:
.text:0040E093
.text:0040E093 pop esi
.text:0040E094 pop ebx
.text:0040E095 retn 0Ch
.text:0040E095 sub_40E040 endp
.text:0040E095 ; ---------------------------------------------------------------------------
.text:0040E098 align 10h
Here is one that looks better, with only 1 struct instead of 2, but it still has the same problem.

Printing an Int (or Int to String)

I am looking for a way to print an integer in assembler (the compiler I am using is NASM on Linux), however, after doing some research, I have not been able to find a truly viable solution. I was able to find a description for a basic algorithm to serve this purpose, and based on that I developed this code:
; Prints the decimal digits of i one at a time using the 32-bit Linux
; int 80h interface. Digits are produced least-significant first and each
; is written immediately after it is computed.
global _start
section .bss
digit: resb 16                  ; buffer for the digit currently being printed
count: resb 16                  ; incremented once per loop pass
i: resb 16                      ; the number still to be printed
section .data
section .text
_start:
mov dword[i], 108eh ; i = 4238
mov dword[count], 1
L01:
; digit = i % 10
mov eax, dword[i]
cdq                             ; sign-extend eax into edx ahead of div
mov ecx, 0Ah
div ecx                         ; eax = quotient, edx = remainder
mov dword[digit], edx
add dword[digit], 30h ; add 48 to digit to make it an ASCII char
call write_digit
inc dword[count]
; i = i / 10 (the same division is carried out a second time)
mov eax, dword[i]
cdq
mov ecx, 0Ah
div ecx
mov dword[i], eax
; NOTE(review): the loop repeats only while i > 10, so a value of exactly 10
; falls through and is treated as a single digit below.
cmp dword[i], 0Ah
jg L01
add dword[i], 48 ; add 48 to i to make it an ASCII char
mov eax, 4 ; system call #4 = sys_write
mov ebx, 1 ; file descriptor 1 = stdout
mov ecx, i ; store *address* of i into ecx
mov edx, 16 ; byte size of 16
int 80h
jmp exit
exit:
mov eax, 01h ; exit()
xor ebx, ebx ; exit status 0
int 80h
; write_digit: writes the buffer at digit to stdout.
; NOTE(review): the length passed is all 16 reserved bytes, not 1.
write_digit:
mov eax, 4 ; system call #4 = sys_write
mov ebx, 1 ; file descriptor 1 = stdout
mov ecx, digit ; store *address* of digit into ecx
mov edx, 16 ; byte size of 16
int 80h
ret
C# version of what I want to achieve (for clarity):
// Converts a non-negative integer to its decimal string representation.
// Digits are produced least-significant first and pushed on a stack; reading
// the stack back (last in, first out) yields them most-significant first.
// Negative input is not supported.
static string int2string(int i)
{
    Stack<char> stack = new Stack<char>();
    // Peel off the low digit only while at least two digits remain. The test
    // has to be ">= 10": with "> 10" a value that reaches exactly 10 would be
    // emitted as the single character (char)(10 + 48), which is ':'. Testing
    // before the first division also keeps single-digit input from gaining a
    // leading '0'.
    while (i >= 10)
    {
        stack.Push((char)((i % 10) + 48));
        i = i / 10;
    }
    // The remaining value is the most-significant digit.
    stack.Push((char)(i + 48));
    // Stack<T>.ToArray copies the elements in pop (LIFO) order.
    return new string(stack.ToArray());
}
The issue is that it outputs the characters in reverse, so for 4238, the output is 8324. At first, I thought that I could use the x86 stack to solve this problem, push the digits in, and pop them out and print them at the end, however when I tried implementing that feature, it flopped and I could no longer get an output.
As a result, I am a little bit perplexed about how I can implement a stack into this algorithm in order to accomplish my goal, i.e. printing an integer. I would also be interested in a simpler/better solution if one is available (as it's one of my first assembler programs).
One approach is to use recursion. In this case you divide the number by 10 (getting a quotient and a remainder) and then call yourself with the quotient as the number to display; and then display the digit corresponding to the remainder.
An example of this would be:
; printNumber (recursive): prints eax as an unsigned decimal number.
; It divides by 10, recurses on the quotient if that is non-zero, then prints
; the remainder, so the most significant digit comes out first.
;Input
; eax = number to display
; eax and edx are saved on entry and restored before returning.
; printCharacter is defined elsewhere; presumably it prints the character
; passed in eax (TODO confirm).
section .data
const10: dd 10
section .text
printNumber:
push eax
push edx
xor edx,edx ;edx:eax = number
div dword [const10] ;eax = quotient, edx = remainder
test eax,eax ;Is quotient zero?
je .l1 ; yes, don't display it
call printNumber ;Display the quotient (edx is preserved by the callee's push/pop)
.l1:
lea eax,[edx+'0'] ;eax = ASCII digit for the remainder
call printCharacter ;Display the remainder
pop edx
pop eax
ret
Another approach is to avoid recursion by changing the divisor. An example of this would be:
; printNumber (table driven): prints eax as an unsigned decimal number.
; The running value is divided by each power of ten in divisorTable in turn,
; from 1000000000 down to 1, and each quotient is printed as a digit, so ten
; digits are always printed (leading zeros included).
;Input
; eax = number to display
; eax, ebx and edx are saved on entry and restored before returning.
; printCharacter is defined elsewhere; presumably it prints the character
; passed in eax (TODO confirm).
section .data
divisorTable:
dd 1000000000
dd 100000000
dd 10000000
dd 1000000
dd 100000
dd 10000
dd 1000
dd 100
dd 10
dd 1
dd 0 ;end-of-table marker tested by the loop
section .text
printNumber:
push eax
push ebx
push edx
mov ebx,divisorTable ;ebx = address of the current divisor
.nextDigit:
xor edx,edx ;edx:eax = number
div dword [ebx] ;eax = quotient, edx = remainder
add eax,'0' ;eax = ASCII digit
call printCharacter ;Display the quotient
mov eax,edx ;eax = remainder
add ebx,4 ;ebx = address of next divisor
cmp dword [ebx],0 ;Have all divisors been done?
jne .nextDigit
pop edx
pop ebx
pop eax
ret
This example doesn't suppress leading zeros, but that would be easy to add.
I think that maybe implementing a stack is not the best way to do this (and I really think you could figure out how to do that, seeing as how pop is just a mov and an increment of sp, so you can really set up a stack anywhere you like by just allocating memory for it and setting one of your registers as your new 'stack pointer').
I think this code could be made clearer and more modular if you actually allocated memory for a c-style null delimited string, then create a function to convert the int to string, by the same algorithm you use, then pass the result to another function capable of printing those strings. It will avoid some of the spaghetti code syndrome you are suffering from, and fix your problem to boot. If you want me to demonstrate, just ask, but if you wrote the thing above, I think you can figure out how with the more split up process.
; int_to_string: writes the decimal text of an unsigned integer, followed by
; a line feed, to a buffer. No NUL terminator is written.
; Input
; EAX = the unsigned value to convert (the code divides EAX itself; it is not dereferenced)
; EDI = address of the result buffer
; Output:
; None (EDI is left pointing just past the line feed)
; Clobbers EAX, EBX, ECX and EDX; none of them is saved.
; stosb is assumed to advance EDI upwards (direction flag clear).
int_to_string:
xor ebx, ebx ; clear ebx; it counts the digits pushed on the stack
.push_chars:
xor edx, edx ; clear edx so that edx:eax holds just the value in eax
mov ecx, 10 ; ecx is the divisor: divide by 10
div ecx ; divide edx:eax by ecx; quotient in eax, remainder in edx
add edx, 0x30 ; add 0x30 to the remainder to convert it to an ASCII digit
push edx ; push the digit (least significant digit first)
inc ebx ; one more digit on the stack
test eax, eax ; is the quotient 0?
jnz .push_chars ; if not, produce the next digit
.pop_chars:
pop eax ; pop the digits back, most significant first
stosb ; store the low byte of eax at [EDI] and advance EDI
dec ebx ; one digit fewer on the stack
cmp ebx, 0 ; any digits left?
jg .pop_chars ; yes, repeat
mov eax, 0x0a
stosb ; add line feed
ret ; return to the caller
; intToString: writes the decimal text of an unsigned integer to a buffer as
; a NUL-terminated string.
; eax = number to stringify/output
; edi = location of buffer
; Returns in eax the number of bytes written, counting the trailing NUL.
; edx, ecx, edi and ebp are saved on entry and restored before returning.
; stosb is assumed to advance edi upwards (direction flag clear).
intToString:
push edx
push ecx
push edi
push ebp
mov ebp, esp ; remember the stack level before any digit is pushed
mov ecx, 10
.pushDigits:
xor edx, edx ; zero-extend eax
div ecx ; divide by 10; now edx = next digit
add edx, 30h ; decimal value + 30h => ascii digit
push edx ; push the whole dword, cause that's how x86 rolls
test eax, eax ; stop once the quotient is zero, so no leading zeros are produced
jnz .pushDigits
.popDigits:
pop eax
stosb ; don't write the whole dword, just the low byte
cmp esp, ebp ; if esp==ebp, we've popped all the digits
jne .popDigits
xor eax, eax ; add trailing nul
stosb
mov eax, edi ; eax = address just past the NUL
pop ebp
pop edi ; edi = original buffer address again
pop ecx
pop edx
sub eax, edi ; return number of bytes written
ret

Resources