Why am I getting unexpected results when getting and displaying a character from a string? - string

I'm trying to get the second character of a string (eg e in Test). Using emu8086 to compile.
When I do:
str db 'Test$'
...
mov si, 1 ; get second character of str
mov bl, str[si] ; store the second character
mov ah, 2 ; display the stored character
mov dl, bl
int 21h
The output is e.
But when I do:
str db 25
db ?
db 25 dup (?)
...
mov ah, 0ah ; accept a string
lea dx, str ; store input in variable str
int 21h
mov si, 1 ; get second character of str (??)
mov bl, str[si] ; store the second character
mov ah, 2 ; display the stored character
mov dl, bl
int 21h
I get ♦.
When I change the second snippet's "get second character of str" portion to this:
mov si, 3 ; get second character of str (why is it '3' instead of '1'?)
mov bl, str[si] ; store the second character
I get e.
I don't understand. While it works in the first snippet, why, in the second snippet, do I have set SI to 3 instead of 1, if I'm trying to reference the second character of the string? Or is the method I'm using misled?

str[si] is not some kind of type/array access, but it will translate into instruction memory operand like [si+1234], where "1234" is offset, where the label str points to in memory.
And in your second example the str label points at byte with value 25 (max length of buffer), then str+1 points at returned input length byte (that's the ♦ value you get on output, if you try to print it out as character), and str+2 points at first character of user input. So to get second character you must use str+3 memory address.
The memory is addressable by bytes, so you either have to be aware of byte-size of all elements, or use more labels, like:
str_int_0a: ; label to the beginning of structure for "0a" DOS service
db 25
db ?
str: ; label to the beginning of raw input buffer (first char)
db 25 dup (?)
Then in the code you use the correct label depending on what you want to do:
...
mov ah, 0ah ; accept a string
lea dx, str_int_0a ; store input in memory at address str
int 21h
mov si, 1 ; index of second character of str
mov bl, str[si] ; load the second character
mov ah, 2 ; display the stored character
mov dl, bl
int 21h
...
You should use some debugger and observe values in memory, and registers, and assembled instructions to get the better feel for how these work inside the CPU, how segment:offset addressing is used to access memory in 16b real mode of x86, etc...

Related

How to find the hamming distance for strings that are not necessarily equal length?

I have an assignment asking me to find the hamming distance of two user-input strings that are not necessarily equal in length.
So, I made the following algorithm:
Read both strings
check the length of each string
compare the length of the strings
if(str1 is shorter)
set counter to be the length of str1
END IF
if(str1 is longer)
set counter to be the length of str2
END IF
if(str1 == str2)
set counter to be length of str1
END IF
loop through each digit of the strings
if(str1[digitNo] XOR str2[digitNo] == 1)
inc al
END IF
the final al value is the hamming distance of the strings, print it.
But I'm stuck at step 3 and I don't seem to get it working. any help?
I tried playing around with the registers to save the values in, but none of that worked, I still didn't get it working.
; THIS IS THE CODE I GOT
.model small
.data
str1 db 255
db ?
db 255 dup(?)
msg1 db 13,10,"Enter first string:$"
str2 db 255
db ?
db 255 dup(?)
msg2 db 13,10,"Enter second string:$"
one db "1"
count db ?
.code
.startup
mov ax,#data
mov ds,ax
; printing first message
mov ah, 9
mov dx, offset msg1
int 21h
; reading first string
mov ah, 10
mov dx, offset str1
int 21h
; printing second message
mov ah, 9
mov dx, offset msg2
int 21h
; reading second string
mov ah, 10
mov dx, offset str2
int 21h
; setting the values of the registers to zero
mov si, 0
mov di, 0
mov cx, 0
mov bx, 0
; checking the length of the first string
mov bl, str1+1
add bl, 30h
mov ah, 02h
mov dl, bl
int 21h
; checking the length of the second string
mov bl, str2+1
add bl, 30h
mov ah, 02h
mov dh, bl
int 21h
; comparing the length of the strings
cmp dl,dh
je equal
jg str1Greater
jl str1NotGreater
; if the strings are equal we jump here
equal:
mov cl, dl
call theLoop
; if the first string is greater than the second, we jump here and set counter of str1
str1Greater:
; if the second string is greater than the first, we jump here and set counter to length of str2
Str1NotGreater:
; this is the loop that finds and prints the hamming distance
;we find it by looping over the strings and taking the xor for each 2, then incrementing counter of ones for each xor == 1
theLoop:
end
So, in the code I provided, it's supposed to print the length of each string (it prints the lengths next to each other), but it seems to always keep printing the length of the first string, twice. The register used to store the length of the first string is dl, and the register used to store the length of the second is dh, if I change it back to dl, it would then print the correct length, but I want to compare the lengths, and I think it won't be possible to do so if I save it in dl both times.
but it seems to always keep printing the length of the first string, twice.
When outputting a character with the DOS function 02h you don't get to choose which register to use to supply the character! It's always DL.
Since after printing both lengths you still want to work with these lengths it will be better to not destroy them in the first place. Put the 1st length in BL and the second length in BH. For outputting you copy these in turn to DL where you do the conversion to a character. This of course can only work for strings of at most 9 characters.
; checking the length of the first string
mov BL, str1+1
mov dl, BL
add dl, 30h
mov ah, 02h
int 21h
; checking the length of the second string
mov BH, str2+1
mov dl, BH
add dl, 30h
mov ah, 02h
int 21h
; comparing the length of the strings
cmp BL, BH
ja str1LONGER
jb str1SHORTER
; if the strings are equal we ** FALL THROUGH ** here
equal:
mov cl, BL
mov ch, 0
call theLoop
!!!! You need some way out at this point. Don't fall through here !!!!
; if the first string is greater than the second, we set counter of str1
str1LONGER:
; if the second string is greater than the first, we set counter to length of str2
Str1SHORTER:
; this is the loop that finds and prints the hamming distance
;we find it by looping over the strings and taking the xor for each 2, then incrementing counter of ones for each xor == 1
theLoop:
Additional notes
Lengths are unsigned numbers. Use the unsigned conditions above and below.
Talking about longer and shorter makes more sense for strings.
Don't use 3 jumps if a mere fall through in the code can do the job.
Your code in theLoop will probably use CX as a counter. Don't forget to zero CH. Either using 2 instructions like I did above or else use movzx cx, BL if you're allowed to use instructions that surpass the original 8086.
Bonus
mov si, offset str1+2
mov di, offset str2+2
mov al, 0
MORE:
mov dl, [si]
cmp dl, [di]
je EQ
inc al
EQ:
inc si
inc di
loop MORE

How do i reverse a string on emu8086 assembly language [duplicate]

I have to do a simple calculator in assembly using EMU8086, but every time I try to launch it EMU8086 gives this error:
INT 21h, AH=09h -
address: 170B5
byte 24h not found after 2000 bytes.
; correct example of INT 21h/9h:
mov dx, offset msg
mov ah, 9
int 21h
ret
msg db "Hello$"
I checked the other stuff, but there were no mistakes:
data segment
choice db ?
snum1 db 4 dup(?)
snum2 db 4 dup(?)
sres db 4 dup(?)
num1 db ?
num2 db ?
res db ?
;;menu1 db "Chose a function to procced", 10, 13, "Add [+]", 10, 13, "Sub [-]", 10, 13
;;menu2 db "Mul [*]", 10, 13, "Div [/]", 10, 13, "Mod [%]", 10, 13, "Pow [^]", 10, 13, "Exit [x]$"
messStr db "Enter Your Choice:",10,13,"",10,13,"Add --> +",10,13,"Sub --> -",10,13,"Mul --> *",10,13,"Div --> /",10,13,"Mod --> %",10,13,"Pow --> ^",10,13,"Exit --> X",10,13,"$"
msg1 db "Enter first number$"
msg2 db "Enter second number$"
msg3 db "Press any key to procced$"
msg4 db "The result is $"
ends
stack segment
dw 128 dup(0)
ends
code segment
assume cs:code, ds:data, ss:stack
newline proc ;; new line
push ax
push dx
mov ah, 2
mov DL, 10
int 21h
mov ah, 2
mov DL, 13
int 21h
pop dx
pop ax
ret
endp
printstr proc ;; print string
push BP
mov BP, SP
push dx
push ax
mov dx, [BP+4]
mov ah, 9
int 21h
pop ax
pop dx
pop BP
ret 2
endp
inputstr proc ;; collect input
push BP
mov BP, SP
push bx
push ax
mov bx, [BP+4]
k1:
mov ah, 1
int 21h
cmp al, 13
je sofk
mov [bx], al
inc bx
jmp k1
sofk:
mov byte ptr [bx], '$'
pop ax
pop bx
pop BP
ret 2
endp
getNums proc ;; get the numbers
call newline
push offset msg1
call printstr
call newline
push offset snum1
call inputstr
call newline
push offset msg2
call printstr
call newline
push offset snum2
call inputstr
ret
endp
start:
mov ax, data
mov ds, ax
mov ax, stack
mov ss, ax
;; print the main menu
call newline
push offset msg4
call printstr
;; collect the input
call newline
mov bx, offset choice
mov ah, 1
int 21h
mov [bx], al
;; check it
mov al, choice
cmp al, '+'
jne cexit
call getNums
jmp cont
cexit:
cmp al, 'x'
je cend
cont:
;; pause before going to the main menu
call newline
push offset msg3
call printstr
mov bx, offset choice
mov ah, 1
int 21h
call newline
call newline
call newline
jmp start
cend:
mov ax, 4c00h
int 21h
ends
end start
I cut most of the code segment because it wasn't important here.
After experimenting with the code I found that the problem was related to the lengths of the messages in the data segment. menu1 & menu2 were too long and any message after them can't be printed (msg1 & msg2 are printed, but nothing after them). I checked if I should merge menu1 & menu2, but it didn't help out. Please help me find out what is wrong with it.
The error message means you use int 21h / AH=09h on a string that didn't end with a $ (ASCII 24h). The system-call handler checked 2000 bytes without finding one.
Often, that means your code or data is buggy, e.g. in a fixed string you forgot a $ at the end, or if copying bytes into a buffer then you maybe overwrote or never stored a '$' in the first place.
But in this case, it appears that EMU8086 has a bug assembling push offset msg4. (In a way that truncates the 00B5h 16-bit address to 8-bit, and sign-extends back to 16, creating a wrong pointer that points past where any $ characters are in your data.)
Based on the error message below I know you are using EMU8086 as your development environment.
INT 21h, AH=09h -
address: 170B5
byte 24h not found after 2000 bytes.
; correct example of INT 21h/9h:
mov dx, offset msg
mov ah, 9
int 21h
ret
msg db "Hello$"
I'm no expert on EMU8086 by any stretch of the imagination. I do know why your offsets don't work. I can't tell you if there is a proper way to resolve this, or if it's an EMU8086 bug. Someone with a better background on this emulator would know.
You have created a data segment with some variables. It seems okay to me (but I may be missing something). I decided to load up EMU8086 to actually try this code. It assembled without error. Using the debugger I single stepped to the push offset msg1 line near the beginning of the program. I knew right away from the instruction encoding what was going on. This is the decoded instruction I saw:
It shows the instruction was encoded as push 0b5h where 0b5h is the offset. The trouble is that it is encoded as a push imm8 . The two highlighted bytes on the left hand pane show it was encoded with these bytes:
6A B5
If you review an instruction set reference you'll find the encodings for PUSH instruction encoded with 6A is listed as:
Opcode* Instruction Op/En 64-Bit Mode Compat/Leg Mode Description
6A ib PUSH imm8 I Valid Valid Push imm8.
You may say that B5 fits within a byte (imm8) so what is the problem? The smallest value that can be pushed onto the stack with push in 16-bit mode is a 16-bit word. Since a byte is smaller than a word, the processor takes the byte and sign extends it to make a 16-bit value. The instruction set reference actually says this:
If the source operand is an immediate of size less than the operand size, a sign-extended value is pushed on the stack
B5 is binary 10110101 . The sign bit is the left most bit. Since it is 1 the upper 8 bits placed onto the stack will be 11111111b (FF). If the sign bit is 0 then then 00000000b is placed in the upper 8 bits. The emulator didn't place 00B5 onto the stack, it placed FFB5. That is incorrect! This can be confirmed if I step through the push 0b5h instruction and review the stack. This is what I saw:
Observe that the value placed on the stack is FFB5. I could not find an appropriate syntax (even using the word modifier) to force EMU8086 to encode this as push imm16. A push imm16 would be able to encode the entire word as push 00b5 which would work.
Two things you can do. You can place 256 bytes of dummy data in your data segment like this:
data segment
db 256 dup(?)
choice db ?
... rest of data
Why does this work? Every variable defined after the dummy data will be an offset that can't be represented in a single byte. Because of this EMU8086 is forced to encode push offset msg1 as a word push.
The cleaner solution is to use the LEA instruction. This is the load effective address instruction. It takes a memory operand and computes the address (in this case the offset relative to the data segment). You can replace all your code that uses offset with something like:
lea ax, [msg1]
push ax
AX can be any of the general purpose 16-bit registers. Once in a register, push the 16-bit register onto the stack.
Someone may have a better solution for this, or know a way to resolve this. If so please feel free to comment.
Given the information above, you may ask why did it seem to work when you moved the data around? The reason is that the way you reorganized all the strings (placing the long one last) caused all the variables to start with offsets that were less than < 128. Because of this the PUSH of an 8-bit immediate offset sign extended a 0 in the top bits when placed on the stack. The offsets would be correct. Once the offsets are >= 128 (and < 256) the sign bit is 1 and the value placed on the stack sign will have an upper 8 bits of 1 rather than 0.
There are other bugs in your program, I'm concentrating on the issue directly related to the error you are receiving.
I reviewed your code and concentrated on the following sequence of instructions:
mov bx, offset choice ; here you set BX to the address of 'choice'
mov ah, 1
int 21h ; here you 'READ CHARACTER FROM STANDARD INPUT, WITH ECHO'
mov [bx], al ; because INT 21h does preserve BX, you are writing back the result of the interrupt call (AL) back to the memory location at BX, which is named 'choice'
;; check it
mov al, choice ; HERE you are moving a BYTE variable named 'choice' to AL, overwriting the result of the last INT 21h call
cmp al, '+' ; ... and compare this variable to the ASCII value of '+'
jne cexit ; if this variable is unequal to '+' you jump to 'cexit'
call getNums ; otherwise you try to get another number from the input/STANDARD CONSOLE
So your sequence
mov bx, offset choice ; here you set BX to the address of 'choice'
...
mov [bx], al ; because INT 21h does preserve BX, you ...
...
mov al, choice
essentially means, that you are setting BX to the address of 'choice', then setting 'choice'([BX]) to AL and copying it back to AL.
This is redundant.
After that, you compare that char to '+' and...
if that char equals to '+', you get the next char with call getNums and then continue with cont:.
if that char does not equal to '+', you compare it to 'x', the exit-char. If it's not 'x', you fall through to cont:
No error here.
So your problem with menu1 and menu2 may stem from some escape characters included in your strings like %,/,\. For example, % is a MACRO character in some assemblers which may create problems.
simple solution is that your strings should always end in '$'
change DUP(?) to DUP('$') and all other strings end with ,'$'

how to store strings in 8086

Im using emu8086.
For example i have a macro called 'store' which takes a string and stores it in an array, how do i do that?
sample code:
arrayStr db 30 dup(' ')
store "qwerty"
store MACRO str
*some code here which stores str into arrayStr*
endm
Most examples i found on the internet revolve around already having the string stored in a variable (ex. string db (some string here)) but i want something where the variables get initialized empty first.
Do you want to change a variable at runtime? In this case take a look at the PRINT-macro in emu8086.inc. A few changes and you've got a STORE-macro:
store MACRO str
LOCAL skip_data, endloop, repeat, localdata
jmp skip_data ; Jump over data
localdata db str, '$', 0 ; Store the macro-argument with terminators
skip_data:
mov si, OFFSET localdata
mov di, OFFSET msg
repeat: ; Loop to store the string
cmp byte ptr [si], 0 ; End of string?
je endloop ; Yes: end of loop
movsb ; No: Copy one byte from DS:SI to ES:DI, inc SI & DI
jmp repeat ; Once more
endloop:
ENDM
crlf MACRO
LOCAL skip_data, localdata
jmp skip_data
localdata db 13, 10, '$'
skip_data:
mov dx, offset localdata
mov ah, 09h
int 21h
ENDM
ORG 100h
mov dx, OFFSET msg
mov ah, 09h
int 21h
crlf
store "Hello!"
mov dx, OFFSET msg
mov ah, 09h
int 21h
crlf
store "Good Bye."
mov dx, OFFSET msg
mov ah, 09h
int 21h
mov ax, 4C00h
int 21h
msg db "Hello, World!", '$'
It depends on, what you want to do with the string
Here are some examples:
ASCIZ-String
The string ends with a zero-byte.
The advantage is that everytime the CPU loads a single byte from the RAM the zero-flag is set if the end of the string is reached.
The disadvantage is that the string mustn't contain another zero-byte. Otherwise the program would interprete an earlier zero-byte as the end of the string.
String input from DOS-Function Readln (int 21h/ ah=0ah)
The first byte defines, how long the string inputted by the user could be maximally. The effective length is defined in the second byte. The rest contains the string.
String which is ready to be outputted using WriteLn (int 21h/ ah=09h)
The string ends with a dollar-sign (ASCII 36).
The advantage is that your programm can output the string using a single function (int 21h/ ah=09h).
The disadvantage is that the string mustn't contain another dollar-sign. Otherwise the program would interprete an earlier dollar-sign as the end of the string.
String whose length is defined in a word/byte at the beginning of the String
Unformatted String
You don't have to save the length in a variable nor marking the end, if you save the length to a constant which you can put in a register (e.g. in CX)

How to compare two strings in assembly?

I'm new in assembly. I want to compare two string using "cmps". I read some examples and I write this :
GETSTR MACRO STR
MOV AH,0AH
LEA DX,STR
INT 21H
ENDM
PRINTSTR MACRO STR
MOV AH,09H
LEA DX,STR
INT 21H
ENDM
EXTRA SEGMENT
DEST DB ?
EXTRA ENDS
DATA SEGMENT
SOURCE DB ?
STR1 DB 0AH,0DH,'ENTER STR : ' ,'$'
ENTER DB 10,13,'$'
SAME DB 0AH,0DH,'TWO STR ARE THE SAME ' ,'$'
NSAME DB 0AH,0DH,'TWO STR ARE NOT THE SAME ' ,'$'
USER DB 6,10 DUP('$')
USER1 DB 6,10 DUP('$')
DATA ENDS
CODE SEGMENT
ASSUME DS:DATA,CS:CODE,ES:EXTRA
START:
MOV AX,DATA
MOV DS,AX
MOV AX,EXTRA
MOV ES,AX
PRINTSTR STR1
GETSTR USER1
PRINTSTR STR1
GETSTR USER
LEA BX,USER
MOV SI,BX
LEA BX,USER1
MOV DI,BX
CLD
MOV CX,5
REPE CMPSB
JCXZ MTCH
PRINTSTR NSAME
JMP ENDPR
MTCH:
PRINTSTR SAME
ENDPR:
MOV AH,4CH
INT 21H
CODE ENDS
END START
I have some question:
what is exactly the numbers 6,10 in the code below :
USER DB 6,10 DUP('$')
Is there any mistake with the Macros?
Is it necessary to declare EXTRA SEGMENT ?
For any similar strings input the output is : "they are not the same?" what is the reason?
The number 6 defines the number of characters plus 1 that you want DOS to input. The number 10 defines the length of the buffer that follows. Actually the number 7 would have been enough!
The macros seem fine.
You don't need the EXTRA segment. Moreover putting it into ES is wrong because both strings that you will be comparing are in the DATA segment.
Also both LEA instructions must fetch an address that is 2 higher. The first byte will still be the maximum number of bytes to read (6) and the second byte will be the number of bytes actually read [0,5]
The comparison you're making indifferably uses 5 characters. If you don't take into account the real number of characters as reported by DOS in the second byte it's no wonder results might not be satisfying.

Manipulate string in assembly x86 (mov and print to screen)

I'm working on a larger project but I'm stuck with string manipulation. My assembly file includes math coprocessor operations (it starts the coprocessor with "FINIT") but I don't think it should be interfering at all.
Basically, I have some strings that are 50 bytes long each:
$s db 50 dup (?), '$'
_cte_14 db "hello world", '$', 39 dup (?)
I need to assign the value stored in variable "_cte_14" to variable "$s"
I attempted to use a register to temporarily store the value, like this:
mov cx, _cte_14
mov $s, cx
but I get the "operand types do not match" error.
Since I know the AX, BX, CX, DX registers only hold 16 bits, I thought maybe I need to work with the memory address of the first string character, so I tried:
mov bx, offset _cte_14
mov $s, bx
but the same error shows up.
I'm using TASM to compile for an x86 processor. What would be the right way to accomplish this?
Thanks a lot in advance.
Example for to copy the characters in a loop:
s db 51 dup ('$')
_cte_14 db "hello world"
len = ($ - _cte_14) ; (current location - offset _cte_14)
40 dup ('$')
mov si, offset _cte_14 ; get source offset
mov di, offset s ; get destination offset
mov cl, len ; length of the string
P1:
mov al, [si] ; get byte from DS:SI
mov [di], al ; store byte to DS:DI
dec cl ; decrease counter, set zero flag if zero
jnz P1 ; jump if zero flag is not set
-- Variation with using a string instruction together with a repeat instruction prefix:
mov si, offset _cte_14
mov di, offset s
mov cx, len ; length of the string
cld ; clear direction flag
rep movsb ; copy from DS:SI to ES:DI, increase SI+DI, decrease CX, repeat CX times

Resources