128-bit division intrinsic in Visual C++ - visual-c++

I'm wondering if there really is no 128-bit division intrinsic function in Visual C++?
There is a 64x64=128 bit multiplication intrinsic function called _umul128(), which nicely matches the MUL x64 assembler instruction.
Naturally, I assumed there would be a 128/64=64 bit division intrinsic as well (modelling the DIV instruction), but to my amazement neither Visual C++ nor Intel C++ seem to have it, at least it's not listed in intrin.h.
Can someone confirm that? I tried grep'ing for the function names in the compiler executable files, but couldn't find _umul128 in the first place, so I guess I looked in the wrong spot.
Update: at least I have now found the pattern umul128 (without the leading underscore) in c1.dll of Visual C++ 2010. All the other intrinsics are listed around it, but unfortunately no "udiv128" or the like :( So it seems they really have "forgotten" to implement it.
To clarify: I'm not only looking for a 128-bit data type, but a way to divide a 128-bit scalar int by a 64-bit int in C++. Either an intrinsic function or native 128-bit integer support would solve my problem.
Edit: The answer is no, there is no _udiv128 intrinsic in Visual Studio 2010 up to 2017, but it is available in Visual Studio 2019 RTM

If you don't mind little hacks, this may help (64-bit mode only, not tested):
#include <windows.h>
#include <stdio.h>
/* Raw x64 machine code for a 128/64 unsigned-division thunk.
 * x64 __fastcall passes numhi in RCX, numlo in RDX, den in R8, rem in R9.
 * DIV expects the 128-bit dividend in RDX:RAX, so the stub shuffles the
 * halves into place, divides, stores the remainder through R9 and
 * returns the quotient in RAX. */
unsigned char udiv128Data[] =
{
0x48, 0x89, 0xD0, // mov rax,rdx   ; numlo -> RAX (low half of dividend)
0x48, 0x89, 0xCA, // mov rdx,rcx   ; numhi -> RDX (high half of dividend)
0x49, 0xF7, 0xF0, // div r8        ; RDX:RAX / R8 -> quotient RAX, remainder RDX
0x49, 0x89, 0x11, // mov [r9],rdx  ; *rem = remainder
0xC3 // ret                        ; quotient returned in RAX
};
/* Same stub with IDIV for the signed 128/64 division. */
unsigned char sdiv128Data[] =
{
0x48, 0x89, 0xD0, // mov rax,rdx   ; numlo -> RAX
0x48, 0x89, 0xCA, // mov rdx,rcx   ; numhi -> RDX
0x49, 0xF7, 0xF8, // idiv r8       ; signed divide RDX:RAX / R8
0x49, 0x89, 0x11, // mov [r9],rdx  ; *rem = remainder
0xC3 // ret                        ; quotient in RAX
};
/* Function pointer aimed directly at the byte array above.
 * NOTE(review): the array must be made executable (see VirtualProtect in
 * main) before the first call; calling data cast to a function pointer
 * is formally undefined behavior that works under MSVC on x64. */
unsigned __int64 (__fastcall *udiv128)(unsigned __int64 numhi,
unsigned __int64 numlo,
unsigned __int64 den,
unsigned __int64* rem) =
(unsigned __int64 (__fastcall *)(unsigned __int64,
unsigned __int64,
unsigned __int64,
unsigned __int64*))udiv128Data;
/* Signed counterpart pointing at the IDIV stub. */
__int64 (__fastcall *sdiv128)(__int64 numhi,
__int64 numlo,
__int64 den,
__int64* rem) =
(__int64 (__fastcall *)(__int64,
__int64,
__int64,
__int64*))sdiv128Data;
/* Demonstrate both stubs.  VirtualProtect marks the byte arrays as
 * executable, which is required under DEP before jumping into them.
 * NOTE(review): PAGE_EXECUTE_READ would suffice after this point;
 * leaving the pages writable+executable is unnecessary. */
int main(void)
{
DWORD dummy;
unsigned __int64 ur;   /* receives the unsigned remainder */
__int64 sr;            /* receives the signed remainder */
VirtualProtect(udiv128Data, sizeof(udiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
VirtualProtect(sdiv128Data, sizeof(sdiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
/* 128-bit dividend split as hi=0x00000123456789AB, lo=0xCDEF000000000000 */
printf("0x00000123456789ABCDEF000000000000 / 0x0001000000000000 = 0x%llX\n",
udiv128(0x00000123456789AB, 0xCDEF000000000000, 0x0001000000000000, &ur));
/* hi=-1 (sign extension), lo=-6 encodes the 128-bit value -6; -6 / -2 = 3 */
printf("-6 / -2 = %lld\n",
sdiv128(-1, -6, -2, &sr));
return 0;
}

A small improvement - one less instruction
extern "C" digit64 udiv128(digit64 low, digit64 hi, digit64 divisor, digit64 *remainder);
; Unsigned 128/64 division for the Windows x64 calling convention.
; The dividend arrives already split across RCX (low) and RDX (high);
; DIV wants it in RDX:RAX, so only RAX needs loading -- one instruction
; fewer than the byte-array version above.
; Arguments
; RCX Low Digit
; RDX High Digit
; R8 Divisor
; R9 *Remainder
; RAX Quotient upon return
.code
udiv128 proc
mov rax, rcx ; Put the low digit in place (hi is already in RDX where DIV wants it)
div r8 ; 128 bit divide rdx-rax/r8 = rdx remainder, rax quotient
mov [r9], rdx ; Save the remainder
ret ; Return the quotient in RAX
udiv128 endp
end

It's available now. You can use _div128 and _udiv128
The _div128 intrinsic divides a 128-bit integer by a 64-bit integer. The return value holds the quotient, and the intrinsic returns the remainder through a pointer parameter. _div128 is Microsoft specific.
Last year it was said to be available from "Dev16", but I'm not sure which version that is. I guess it's VS 16.0, a.k.a. VS2019, but the documentation on MSDN shows that it goes back as far as VS2015.

I am no expert, but I dug this up:
http://research.swtch.com/2008/01/division-via-multiplication.html
Interesting stuff. Hope it helps.
EDIT: This is insightful too: http://www.gamedev.net/topic/508197-x64-div-intrinsic/

Thanks @alexey-frunze, it worked with a little tweak for VS2017; checked with the same parameters on VS2019:
#include <iostream>
#include <string.h>
#include <math.h>
#include <immintrin.h>
#define no_init_all
#include <windows.h>
/* Raw x64 machine code for the unsigned 128/64 division thunk.
 * x64 __fastcall: RCX = numhi, RDX = numlo, R8 = den, R9 = rem.
 * DIV needs the dividend in RDX:RAX, so the stub swaps the halves,
 * divides, stores the remainder through R9 and returns the quotient. */
unsigned char udiv128Data[] =
{
0x48, 0x89, 0xD0, // mov rax,rdx   ; numlo -> RAX
0x48, 0x89, 0xCA, // mov rdx,rcx   ; numhi -> RDX
0x49, 0xF7, 0xF0, // div r8        ; RDX:RAX / R8
0x49, 0x89, 0x11, // mov [r9],rdx  ; *rem = remainder
0xC3 // ret                        ; quotient in RAX
};
/* Same stub with IDIV for signed division. */
unsigned char sdiv128Data[] =
{
0x48, 0x89, 0xD0, // mov rax,rdx
0x48, 0x89, 0xCA, // mov rdx,rcx
0x49, 0xF7, 0xF8, // idiv r8       ; signed divide
0x49, 0x89, 0x11, // mov [r9],rdx
0xC3 // ret
};
/* Function pointer over the unsigned stub.
 * NOTE(review): udiv128Data must be made executable (VirtualProtect)
 * before the first call -- see test1(). */
unsigned __int64(__fastcall* udiv128)(
unsigned __int64 numhi,
unsigned __int64 numlo,
unsigned __int64 den,
unsigned __int64* rem) =
(unsigned __int64(__fastcall*)(
unsigned __int64,
unsigned __int64,
unsigned __int64,
unsigned __int64*))
((unsigned __int64*)udiv128Data);
/* Signed counterpart over the IDIV stub -- see test2(). */
__int64(__fastcall *sdiv128)(
__int64 numhi,
__int64 numlo,
__int64 den,
__int64* rem) =
(__int64(__fastcall *)(
__int64,
__int64,
__int64,
__int64*))
((__int64*)sdiv128Data);
void test1()
{
unsigned __int64 a = 0x3c95ba9e6a637e7;
unsigned __int64 b = 0x37e739d13a6d036;
unsigned __int64 c = 0xa6d036507ecc7a7;
unsigned __int64 d = 0x7ecc37a70c26e68;
unsigned __int64 e = 0x6e68ac7e5f15726;
DWORD dummy;
VirtualProtect(udiv128Data, sizeof(udiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
e = udiv128(a, b, c, &d);
printf("d = %llx, e = %llx\n", d, e); // d = 1ed37bdf861c50, e = 5cf9ffa49b0ec9aa
}
/* Same exercise for the signed stub: mark sdiv128Data executable and
 * run the identical operands through sdiv128.  All values are positive
 * 63-bit quantities, so the printed results match test1's. */
void test2()
{
    __int64 hi = 0x3c95ba9e6a637e7;
    __int64 lo = 0x37e739d13a6d036;
    __int64 divisor = 0xa6d036507ecc7a7;
    __int64 rem = 0x7ecc37a70c26e68;
    __int64 quot = 0x6e68ac7e5f15726;
    DWORD oldProtect;

    VirtualProtect(sdiv128Data, sizeof(sdiv128Data), PAGE_EXECUTE_READWRITE, &oldProtect);
    quot = sdiv128(hi, lo, divisor, &rem);
    printf("d = %llx, e = %llx\n", rem, quot); // d = 1ed37bdf861c50, e = 5cf9ffa49b0ec9aa
}
/* Entry point: run the unsigned division test, then the signed one. */
int main()
{
    test1();
    test2();
    return 0;
}

Related

symbol without any name and completed.7392 symbol in .bss section

In my sample C program, compiled with gcc, .bss section has an index [24], as shown by readelf -S.
When I try to see the things stored in .bss, by running
readelf -s ./pointer | grep 24, I get
Num: Value Size Type Bind Vis Ndx Name
24: 00000000000040a0 0 SECTION LOCAL DEFAULT 24
31: 00000000000040a8 1 OBJECT LOCAL DEFAULT 24 completed.7392
54: 00000000000040b0 8 OBJECT GLOBAL DEFAULT 24 label
68: 00000000000040c0 0 NOTYPE GLOBAL DEFAULT 24 _end
70: 00000000000040b8 4 OBJECT GLOBAL DEFAULT 24 i
71: 0000000000004090 0 NOTYPE GLOBAL DEFAULT 24 __bss_start
79: 00000000000040ac 4 OBJECT GLOBAL DEFAULT 24 err
81: 00000000000040a0 8 OBJECT GLOBAL DEFAULT 24 stderr##GLIBC_2.2.5
size ./pointer gives me
text data bss dec hex filename
3196 680 32 3908 f44 ./pointer
what's the symbol without any name and symbol with name completed.7392?
and why don't the symbol sizes add up to the 32 bytes of bss reported by size? [ they total only 25 ]
As a side question, where are stdin and stdout symbols? I can find only stderr, and that is in the bss section.
program source attached below. compiled with gcc version 9.2
/*
* A program that will read and print printable characters in it's memory given a memory address
* until it segfaults
*/
#define _GNU_SOURCE /* Bring REG_XXX names from /usr/include/sys/ucontext.h */
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <ucontext.h>
int i,err=0;   /* i: index of the byte being read; err: set to 1 by the SIGSEGV handler */
void sighandler(int signum);   /* forward declaration (the handler actually installed is sigaction_segv) */
void* label;   /* address of label 'l' inside readmem(), captured via GCC's &&label extension */
/* Prompt for a hex address, then print the bytes found there one by one
 * until a SIGSEGV occurs; the handler (sigaction_segv) skips the
 * faulting load and sets err, which ends the loop. */
void readmem(){
long loc;
i=0;
err=0;
printf("enter mem location:");
scanf("%lx",&loc);   /* NOTE(review): return value unchecked; loc is indeterminate on bad input */
printf("--dump begin--\n");
char * addr = (void *) loc;
char ch;
label=&&l;   /* GCC extension: record the address of label 'l' below */
/* kept so that one can find how many bytes to increment rip by to
recover from the segfault; the address can also be found using gdb
*/
while(1){
ch=addr[i];   /* may fault; the handler advances RIP past this load */
l:
printf("%c",ch);   /* after a fault this prints the stale previous ch */
i++;
if ( err == 1)
break;
// printf("%d\n",i);
}
}
/* SIGSEGV handler: report the faulting address, advance the saved
 * instruction pointer past the 6-byte faulting load so execution
 * resumes at label 'l' inside readmem(), and set err so the dump
 * loop terminates.  The +6 skip count is specific to the code the
 * compiler generated here (see the disassembly below). */
static void sigaction_segv(int signal, siginfo_t *si, void *arg)
{
ucontext_t *ctx = (ucontext_t *)arg;
/* We are on linux x86; the resume IP is stored in RIP (64bit) or EIP (32bit).
In this build, the length of the offending instruction is 6 bytes,
so we skip the offender.
&&l returns the address pointed to by label l:
(gdb) disass readmem
...
0x0000555555555284 <+139>: mov -0x10(%rbp),%rax
0x0000555555555288 <+143>: add %rdx,%rax
=> 0x000055555555528b <+146>: movzbl (%rax),%eax -> rip on segfault
0x000055555555528e <+149>: mov %al,-0x19(%rbp)
0x0000555555555291 <+152>: movsbl -0x19(%rbp),%eax
0x0000555555555295 <+156>: mov %eax,%edi
0x0000555555555297 <+158>: callq 0x555555555030 <putchar#plt>
...
>>
(gdb) p (void*) label
$1 = (void *) 0x555555555291 <readmem+152> ->address of next instruction
>>
we need to go to <readmem+152>, i.e. the next instruction,
so we add decimal 6 to rip in the handler.
NOTE(review): the skip distance is fragile -- it changes with compiler
version and optimization level; re-check the disassembly when rebuilding.
*/
#if __WORDSIZE == 64
printf("\nCaught SIGSEGV, addr %p, RIP 0x%lx\n",si->si_addr,ctx->uc_mcontext.gregs[REG_RIP]);
ctx->uc_mcontext.gregs[REG_RIP] += 6;
#else
printf("Caught SIGSEGV, addr %p, EIP 0x%x\n",si->si_addr,ctx->uc_mcontext.gregs[REG_EIP]);
ctx->uc_mcontext.gregs[REG_EIP] += 6;
#endif
err=1;
printf("no of bytes read:%d\n",i);
}
/* Install the SIGSEGV recovery handler, show this process's memory map
 * (so the user can pick addresses to dump), then loop forever prompting
 * for addresses.  Never returns. */
int main () {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sigemptyset(&sa.sa_mask);
    sa.sa_sigaction = sigaction_segv;
    sa.sa_flags = SA_SIGINFO;   /* deliver siginfo_t and ucontext to the handler */
    if (sigaction(SIGSEGV, &sa, NULL) == -1) {
        fprintf(stderr, "failed to setup SIGSEGV handler\n");
        exit(1);
    }
    /* FIX: the original sprintf wrote into a fixed 25-byte buffer with no
     * bound -- a 7-digit PID already needs 23 bytes.  snprintf bounds the
     * write so it can never overflow. */
    char c[64];
    snprintf(c, sizeof c, "cat /proc/%d/maps", getpid());
    system(c);
    while(1){
        readmem();
    }
}

VMX performance issue with rdtsc (no rdtsc exiting, using rdtsc offseting)

I am working a Linux kernel module (VMM) to test Intel VMX, to run a self-made VM (The VM starts in real-mode, then switches to 32bit protected mode with Paging enabled).
The VMM is configured to NOT use rdtsc exit, and use rdtsc offsetting.
Then, the VM runs rdtsc to check the performance, like below.
/* Execute CPUID for leaf 'code' and hand the four result registers back
 * through the output pointers. */
static void cpuid(uint32_t code, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) {
    uint32_t a, b, c, d;
    __asm__ volatile("cpuid"
                     : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                     : "a"(code)
                     : "cc");
    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
}
/* Read the 64-bit time-stamp counter; RDTSC delivers it split across
 * EDX (high half) and EAX (low half). */
uint64_t rdtsc(void)
{
    uint32_t lo32, hi32;
    asm volatile("rdtsc" : "=a" (lo32), "=d" (hi32));
    uint64_t ticks = hi32;
    return (ticks << 32) | lo32;
}
/* Runs inside the guest (32-bit protected mode, ring 0): dump CR0, then
 * time 100 back-to-back RDTSC executions.
 * NOTE(review): vm_tsc and my_printf are defined elsewhere in the VMM;
 * the "%d" below is handed a 64-bit difference -- confirm my_printf's
 * conversion handling (plain printf would need %llu here). */
void i386mode_tests(void)
{
u32 eax, ebx, ecx, edx;
u32 i = 0;
/* Read CR0 via EAX (privileged; the guest runs this in ring 0). */
asm ("mov %%cr0, %%eax\n"
"mov %%eax, %0 \n" : "=m" (eax) : :);
my_printf("Guest CR0 = 0x%x\n", eax);
/* CPUID is a serializing instruction: it fences earlier work out of the timed region. */
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
vm_tsc[0]= rdtsc();
for (i = 0; i < 100; i ++) {
rdtsc();
}
vm_tsc[1]= rdtsc();
my_printf("Rdtsc takes %d\n", vm_tsc[1] - vm_tsc[0]);
}
The output is something like this,
Guest CR0 = 0x80050033
Rdtsc takes 2742
On the other hand, I make a host application to do the same thing, like above
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* CPUID wrapper: run the requested leaf and store EAX..EDX through the
 * caller's pointers. */
static void cpuid(uint32_t code, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) {
    __asm__ volatile("cpuid"
                     : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
                     : "a"(code)
                     : "cc");
}
/* Return the processor's 64-bit time-stamp counter (EDX:EAX from RDTSC). */
uint64_t rdtsc(void)
{
    uint32_t eax_part, edx_part;
    asm volatile("rdtsc" : "=a" (eax_part), "=d" (edx_part));
    return ((uint64_t)edx_part << 32) + eax_part;
}
/* Time 100 back-to-back RDTSC executions on the host, for comparison
 * against the identical loop running inside the guest. */
int main(int argc, char **argv)
{
    uint64_t vm_tsc[2];
    uint32_t eax, ebx, ecx, edx, i;

    /* CPUID serializes, so earlier instructions cannot leak into the timed region. */
    cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
    vm_tsc[0] = rdtsc();
    for (i = 0; i < 100; i++) {
        rdtsc();
    }
    vm_tsc[1] = rdtsc();
    /* FIX: the difference is uint64_t but "%ld" expects a signed long --
     * undefined behavior on ILP32 targets.  Print with a matching specifier. */
    printf("Rdtsc takes %llu\n", (unsigned long long)(vm_tsc[1] - vm_tsc[0]));
    return 0;
}
It outputs followings,
Rdtsc takes 2325
Running above two codes in 40 iterations to get the average value as followings,
avag(VM) = 3188.000000
avag(host) = 2331.000000
The performance difference can NOT be ignored, when running the codes in VM and in host. It is NOT expected.
My understanding is, using TSC offsetting + no RDTSC exit, there should be little difference in rdtsc, running in VM and host.
Here are VMCS fields,
0xA501E97E = control_VMX_cpu_based
0xFFFFFFFFFFFFFFF0 = control_CR0_mask
0x0000000080050033 = control_CR0_shadow
In the last level of EPT PTEs, bit[5:3] = 6 (Write Back), bit[6] = 1. EPTP[2:0] = 6 (Write Back)
I tested in bare-metal, and in VMware, I got the similar results.
I am wondering if there is anything I missed in this case.

why the lock field of Linux kernel's rwlock_t is unsigned int

In Linux 2.6.11.12, the rwlock_t is defined like this:
/* Reader/writer spinlock word as defined in Linux 2.6.11 (i386).
 * NOTE(review): 'lock' is manipulated through atomic_t casts elsewhere
 * (see _raw_read_trylock below), so its unsigned declaration here does
 * not reflect how it is actually compared. */
typedef struct {
volatile unsigned int lock;
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned magic;   /* debug-build cookie -- presumably for corruption detection; verify against kernel source */
#endif
#ifdef CONFIG_PREEMPT
unsigned int break_lock;   /* present only with preemption support */
#endif
} rwlock_t;
In the definition of rwlock_t, the lock field is unsigned int. When we want to get a read lock, read_lock() will eventually call the _raw_read_trylock()
/* Try to acquire a read lock without spinning: 1 on success, 0 on failure.
 * Key point (the answer to the question above): the unsigned 'lock' word
 * is reinterpreted as atomic_t, whose counter field is a plain int on
 * i386, so the >= 0 comparison below is a SIGNED comparison -- the
 * unsigned declaration in rwlock_t never enters into it. */
static inline int _raw_read_trylock(rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
atomic_dec(count);   /* optimistically claim a reader slot */
if (atomic_read(count) >= 0)   /* signed test: negative means a writer holds the lock */
return 1;
atomic_inc(count);   /* lost the race -- give the slot back */
return 0;
}
In this function, we call atomic_dec() to decrease the lock and check whether it is greater than or equal to zero. But since lock is an unsigned int, it seems lock will always be greater than or equal to 0! That is, this function would always return 1.
I guessed that the atomic_read() convert the result to a int, but in i386 arch, it is defined as
#define atomic_read(v) ((v)->counter)
and I have no idea how unsigned int lock works.

Implementing rint() in x86-64

MSVC 2012 doesn't have the rint() function. For 32-bit, I'm using the following:
/* Round x to the nearest integer value using the current x87 rounding
 * mode (round-to-nearest-even by default).  32-bit MSVC only.
 * NOTE(review): there is deliberately no C return statement -- FRNDINT
 * leaves the result in ST(0), which is where the 32-bit calling
 * convention returns doubles.  This relies on implementation-defined
 * behavior and does not build for x64, where __asm is unavailable. */
double rint(double x) {
__asm {
fld x
frndint
}
}
This doesn't work in x64. There's _mm_round_sd() but that requires SSE4. What is an efficient preferrably branchless way of getting the same behavior?
rint 64-bit mode
#include <emmintrin.h>
/* Round x to the nearest integer using the current SSE rounding mode
 * (round-to-nearest-even by default).  x64 only.
 * FIX: convert through a 64-bit integer (_mm_cvtsd_si64) instead of a
 * 32-bit one -- the original _mm_cvtsd_si32 silently limited the valid
 * range to +/-2^31 (noted in the edit below); this extends it to +/-2^63,
 * which covers every double with a fractional part. */
static inline double rint (double const x) {
    return (double)_mm_cvtsd_si64(_mm_load_sd(&x));
}
See Agner Fog's Optimizing C++ manual for lrint
32-bit mode
// Example 14.19
/* Round to nearest 32-bit integer using the x87 FPU's current rounding
 * mode (round-to-nearest-even by default).  32-bit builds only; both
 * branches load the double and store it with FISTP, which rounds. */
static inline int lrint (double const x) { // Round to nearest integer
int n;
#if defined(__unix__) || defined(__GNUC__)
// 32-bit Linux, Gnu/AT&T syntax:
__asm ("fldl %1 \n fistpl %0 " : "=m"(n) : "m"(x) : "memory" );
#else
// 32-bit Windows, Intel/MASM syntax:
__asm fld qword ptr x;
__asm fistp dword ptr n;
#endif
return n;   /* NOTE(review): values outside int range give an unspecified result */
}
64-bit mode
// Example 14.21. // Only for SSE2 or x64
#include <emmintrin.h>
/* Round x to the nearest 32-bit integer using the current SSE rounding
 * mode (round-to-nearest-even by default).  SSE2 or x64 only. */
static inline int lrint (double const x) {
    __m128d v = _mm_set_sd(x);
    return _mm_cvtsd_si32(v);
}
Edit:
I just realized that this method will limit the values to to +/- 2^31. If you want a version with a larger range with SSE2 it's complicated (but easy with SSE4.1). See the round function in Agner Fog's Vector Class in the file vectorf128.h for an example.

Linux sys_call_table rip relative addressing x86_64

I am trying to get offset of sys_call_table on Linux x86_64.
First of all I read pointer to system_call entry by reading it from MSR_LSTAR and it's correct
/* Read the model-specific register selected by 'msr' via RDMSR
 * (privileged -- kernel context only).  RDMSR takes the MSR index in
 * ECX and returns the value split across EDX (high) and EAX (low). */
static unsigned long read_msr(unsigned int msr)
{
unsigned low, high;
asm volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
return ((low) | ((u64)(high) << 32));   /* u64 is the kernel's 64-bit typedef */
}
Then I parse it to find opcode of call instruction and it is also correct
#define CALL_OP 0xFF
#define CALL_MODRM 0x14
/* Scan forward from the syscall entry code (ptr, from MSR_LSTAR) for the
 * indirect dispatch "call *sys_call_table(,%rax,8)" and recover the table
 * address.  Encoding: FF (opcode) + 14 (ModRM) + SIB + disp32 = 7 bytes.
 * FIX (per the accepted explanation): the disp32 must be read as a
 * SIGNED int and sign-extended to 64 bits -- e.g. 0x81601300 encodes
 * the kernel address 0xffffffff81601300.  With this ModRM/SIB form the
 * displacement is absolute, not RIP-relative, so adding it to ptr (as
 * the original did, unsigned) produced garbage. */
static unsigned long find_syscall_table(unsigned char *ptr)
{
    unsigned long table;

    /* Find the first FF 14 pair: opcode + ModRM of the dispatch call. */
    for (; (*ptr != CALL_OP) || (*(ptr + 1) != CALL_MODRM); ptr++)
        ;

    /* disp32 sits after opcode + ModRM + SIB; sign-extend it to 64 bits. */
    table = (unsigned long)(long)*(int *)(ptr + 3);
    pr_info("%lx", table);
    return table;
}
But I failed to get the address after the call opcode. The first byte at ptr is the opcode, then the ModRM byte, then SIB, and then the 32-bit displacement, so I add 3 to ptr, dereference it as an integer value, and then add it to ptr, because the address is RIP-relative. But the resulting value is wrong; it doesn't coincide with the value I see in gdb — so where am I wrong?
It's not x7e9fed00 but rather -0x7e9fed00 - a negative displacement.
That is the sign-magnitude form of the 2's complement negative number 0x81601300
which is stored by a little-endian processor as "00 13 60 81"
No idea if you will find sys_call_table at the resulting address however. As an alternative idea, it seems some people find it by searching memory for the known pointers to functions that should be listed in it.

Resources