I'm trying to use CUDA with objects. This is a little test program I put together to try things out, but I ran into a problem: when I do anything with the device version of the variable, the copy back to the host fails with the CUDA error "cudaErrorIllegalAddress", but if I just copy the data to the device and back, it works.
If I comment out the printf... line, then it works.
// Minimal payload type that is copied between host and device.
class A {
public:
int s;
};
// Pointer variable that itself lives in device memory (a __device__ symbol).
__device__ A *d_a;
// Kernel: dereferences the device-side pointer d_a and prints its s field.
__global__ void MethodA() {
printf("%d\n", d_a->s);
}
// Host driver: builds an A on the host, tries to mirror it on the device,
// launches MethodA, copies the object back and prints the last status name.
int main() {
A *a = new A();
a->s = 10;
// e is overwritten by every call below; only the final status gets printed.
cudaError e;
// NOTE(review): this takes the address of the __device__ symbol d_a in host
// code. Per the accompanying answer that is not allowed: cudaMalloc must
// store the allocated pointer in a host variable.
e = cudaMalloc((void**)&d_a, sizeof(A));
// NOTE(review): d_a is a __device__ variable; per the accompanying answer it
// has to be set with cudaMemcpyToSymbol rather than used with cudaMemcpy.
e = cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
MethodA << <1, 1 >> > ();
// This is the copy the author reports failing with an illegal-address error
// once the kernel has touched d_a.
e = cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
std::cout << cudaGetErrorName(e) << std::endl;
delete(a);
// Blocks until a character is read from stdin before exiting.
std::getchar();
return 0;
}
Use of the __device__ variable is causing difficulty. It is intended to be used for static allocations, known at compile time.
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
Some problems with your approach:
You are using an incorrect API for modifying a __device__ variable. We don't use cudaMemcpy. We use cudaMemcpyToSymbol, etc.
You are not allowed to take the address of a device entity in host code:
e = cudaMalloc((void**)&d_a, sizeof(A));
^
cudaMalloc expects to store the allocated pointer value in host memory, not in device memory. It will point to a location in device memory, but it should be stored in a host variable.
If you want to stay with your method, the following modifications should make it correct:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
// Minimal payload type that is copied between host and device.
class A {
public:
    int s;
};

// Device-resident pointer to an A. Host code must set it with
// cudaMemcpyToSymbol; it may not take its address or cudaMemcpy into it.
__device__ A *d_a;

// Debug kernel: prints the s field of the object d_a points to.
// Expected launch: <<<1, 1>>> (every launched thread prints one line).
// If the host never stored a pointer in d_a, report that instead of
// dereferencing it, which would fault and poison the CUDA context.
// NOTE(review): assumes an unset __device__ pointer reads as null -- confirm.
__global__ void MethodA() {
    if (!d_a) {
        printf("MethodA: d_a has not been set\n");
        return;
    }
    printf("%d\n", d_a->s);
}
// Host driver: allocates an A on the device, publishes its address through
// the __device__ symbol d_a, runs MethodA and copies the object back.
// Every CUDA call is checked; the first failure short-circuits the remaining
// steps and its message is printed ("no error" on success). Returns 0 on
// success and 1 on any CUDA failure.
int main() {
    A *a = new A();
    a->s = 10;

    // Host-side variable that receives the device allocation's address.
    A *temp_d_a = nullptr;

    cudaError_t status = cudaMalloc((void**)&temp_d_a, sizeof(A));
    if (status == cudaSuccess) {
        status = cudaMemcpy(temp_d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) {
        // Copy the pointer VALUE (sizeof(A *)) into the device symbol.
        status = cudaMemcpyToSymbol(d_a, &temp_d_a, sizeof(A *));
    }
    if (status == cudaSuccess) {
        MethodA<<<1, 1>>>();
        // A launch returns nothing; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: also waits for the kernel, so faults raised while
        // the kernel ran are reported by this call.
        status = cudaMemcpy(a, temp_d_a, sizeof(A), cudaMemcpyDeviceToHost);
    }

    std::cout << cudaGetErrorString(status) << std::endl;

    cudaFree(temp_d_a);  // harmless when the allocation never happened
    delete a;
    return status == cudaSuccess ? 0 : 1;
}
$ nvcc t89.cu -o t89
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
EDIT: Regarding my previous statement:
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
and asked about in the comments below, here is a worked example showing that approach:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
// Minimal payload type that is copied between host and device.
class A {
public:
    int s;
};

// Debug kernel: prints the s field of the object `a` points to.
// `a` must be a device pointer (e.g. obtained from cudaMalloc).
// Expected launch: <<<1, 1>>> (every launched thread prints one line).
// A null pointer is reported instead of being dereferenced.
__global__ void MethodA(A *a) {
    if (!a) {
        printf("MethodA: null pointer passed\n");
        return;
    }
    printf("%d\n", a->s);
}
// Host driver: allocates an A on the device, hands the device pointer to the
// kernel as an ordinary parameter, then copies the object back.
// Every CUDA call is checked; the first failure short-circuits the remaining
// steps and its message is printed ("no error" on success). Returns 0 on
// success and 1 on any CUDA failure.
int main() {
    A *a = new A();
    a->s = 10;

    A *d_a = nullptr;  // an ordinary host-based pointer

    // dynamic allocation created at runtime
    cudaError_t status = cudaMalloc((void**)&d_a, sizeof(A));
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) {
        MethodA<<<1, 1>>>(d_a);  // passed to kernel via parameter
        // A launch returns nothing; configuration errors surface here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess) {
        // Blocking copy: also waits for the kernel, so faults raised while
        // the kernel ran are reported by this call.
        status = cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
    }

    std::cout << cudaGetErrorString(status) << std::endl;

    cudaFree(d_a);  // harmless when the allocation never happened
    delete a;
    return status == cudaSuccess ? 0 : 1;
}
$ nvcc -o t89 t89.cu
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
Related
I am trying to map an std::unordered_map with boost interproccess.
With Windows everything works fine. On Linux i get a segmentation fault.
using namespace boost::interprocess;
// One-byte state value used as the mapped type of the shared map.
struct State {
    uint8_t state = 0;

    // Converting constructor: stores the given raw state value.
    State(uint8_t _state) : state(_state) {}
};
// File-backed segment holding the map; opened if it exists, created otherwise.
managed_mapped_file file(open_or_create, "MySharedMemory", 65536);

typedef uint32_t KeyType;
typedef State MappedType;
typedef std::pair<const KeyType, MappedType> ValueType;
// Allocator that takes node memory from the mapped file's segment manager.
typedef boost::interprocess::allocator<ValueType, boost::interprocess::managed_mapped_file::segment_manager> UnorderedMapAllocator;
typedef boost::unordered_map<KeyType, MappedType, boost::hash<KeyType>, std::equal_to<KeyType>, UnorderedMapAllocator> unordered_map;

// Look the map up by name inside the segment, constructing it if absent.
unordered_map* map = file.find_or_construct<unordered_map>("fault_states")(3, boost::hash<KeyType>(), std::equal_to<KeyType>(), file.get_segment_manager());

// Append 100 new keys after whatever is already stored in the map.
const uint32_t size = (uint32_t)map->size();
for (uint32_t i = size; i < size + 100; ++i) {
    map->insert(ValueType(i, State(46)));
}

// Iterate by const reference (no per-element copy) and widen the uint8_t so
// it is printed as a number instead of as a raw character.
for (const auto& x : *map) {
    std::cout << x.first << " ";
    std::cout << static_cast<unsigned>(x.second.state) << std::endl;
}
On the first run everything works fine. On the second run i get the segmentation fault.
If i use boost::unordered_map instead everything works fine, but i have to use the std version
Boost Version 1.6.4
Gcc Version 7.4.0
// Holds a 360-element sample buffer (mLD) that thread1 keeps rewriting while
// thread2 -> C -> B read it. No synchronization is performed here.
class test
{
    // Writer: endlessly overwrites every slot with an increasing counter.
    void thread1()
    {
        int counter = 0;
        for (;;) {
            for (auto &slot : mLD) {
                slot = counter++;
            }
        }
    }

    // Reader entry point: logs the buffer address, then runs C().
    void thread2()
    {
        std::cout << "thread2 address : " << &mLD << "\n";
        C();
    }

    // Logs the buffer address and scans every sample (the check body is empty).
    void B()
    {
        std::cout << "B address : " << &mLD << "\n";
        for (const double &sample : mLD) {
            if (sample <= 25) {
            }
        }
    }

    // Runs B(), logs the buffer address again and reads the first sample.
    void C()
    {
        B();
        std::cout << "C address : " << &mLD << "\n";
        double distance = mLD[0]; // <---- segmentation fault reported here
    }

    std::array<double, 360> mLD;
};
cout result --->
thread2 address : 0x7e807660
B address : 0x7e807660
C address : 0x1010160 (sometimes 0x7e807660 )
Why did mLD's address change?
Even if I change std::array to std::array<std::atomic<double>, 360>, the result is the same.
Most probably, the object you referred is destroyed at the point of call to C, which points to a synchronization issue. You need to extend the lifetime of the object referred by thread(s), until the threads done executing their routine. To accomplish this, you can have something like this;
#include <thread>
#include <array>
#include <iostream>
// Shared buffer plus two callbacks intended to run on separate threads.
// This version performs no locking, so the callbacks must not run at the
// same time on the same object.
struct foo{
    // Adds 5 to every element of storage.
    void callback1(){
        for(auto & elem: storage){
            elem += 5;
        }
    }
    // Prints every element of storage, one per line.
    void callback2(){
        for(const auto & elem: storage){
            std::cout << elem << std::endl;
        }
    }
    // Value-initialized to all zeros: without the braces the doubles would be
    // indeterminate and `elem += 5` would read uninitialized memory.
    std::array<double, 300> storage{};
};
int main(void){
foo f;
std::thread t1 {[&f](){f.callback1();}};
std::thread t2 {[&f](){f.callback2();}};
// wait until both threads are done executing their routines
t1.join();
t2.join();
return 0;
}
The instance of foo, f, lives in the scope of the main() function, so its lifetime runs from the line where it is defined to the end of main's scope. By joining both threads, we block main from proceeding further until both threads are done executing their callback functions; hence f is guaranteed to stay alive until the callbacks are done.
The second issue is, the code needs synchronization primitives, because storage variable is shared between two independent execution paths. The final code with proper synchronization can look like this;
#include <thread>
#include <array>
#include <iostream>
#include <mutex>
// Shared buffer plus two callbacks that may run on separate threads.
// Each callback holds mtx for its whole duration, so they never overlap.
struct foo{
    // Adds 5 to every element of storage.
    void callback1(){
        // RAII lock: acquires mtx on construction and releases it on scope exit.
        // lock_guard is enough here because the lock is never released early.
        std::lock_guard<std::mutex> lock(mtx);
        for(auto & elem: storage){
            elem += 5;
        }
    }
    // Prints every element of storage, one per line.
    void callback2(){
        std::lock_guard<std::mutex> lock(mtx);
        for(const auto & elem: storage){
            std::cout << elem << std::endl;
        }
    }
    // Value-initialized to all zeros: without the braces the doubles would be
    // indeterminate and `elem += 5` would read uninitialized memory.
    std::array<double, 300> storage{};
    // non-reentrant mutex
    mutable std::mutex mtx;
};
int main(void){
foo f;
std::thread t1 {[&f](){f.callback1();}};
std::thread t2 {[&f](){f.callback2();}};
// wait until both threads are done executing their routines
t1.join();
t2.join();
return 0;
}
I am trying to construct a std::thread with a member function that takes no arguments and returns void. I can't figure out any syntax that works - the compiler complains no matter what. What is the correct way to implement spawn() so that it returns a std::thread that executes test()?
#include <thread>
// Question code: spawn() is supposed to start a thread that runs test().
class blub {
// Private worker that takes no arguments and returns nothing.
void test() {
}
public:
// Intended to return a std::thread executing test() on this object.
std::thread spawn() {
// NOTE: this is the line the author says the compiler rejects. A bare
// member-function name is not something std::thread can be built from.
return { test };
}
};
#include <thread>
#include <iostream>
// Tiny class whose member function is run on a separate thread.
class bar {
public:
    // Prints a fixed greeting.
    void foo() {
        std::cout << "hello from member function" << std::endl;
    }
};

// Starts a thread that calls bar::foo on the thread's own copy of a bar
// object, then waits for it to finish.
int main()
{
    bar worker;
    std::thread t(&bar::foo, worker);
    t.join();
}
EDIT:
Accounting your edit, you have to do it like this:
// Returns a thread that calls blub::test on this object: the pointer to
// member is the callable and `this` is the object it is invoked on.
// The object must stay alive for as long as the thread may still be running.
std::thread spawn() {
return std::thread(&blub::test, this);
}
UPDATE: I want to explain some more points, some of them have also been discussed in the comments.
The syntax described above is defined in terms of the INVOKE definition (§20.8.2.1):
Define INVOKE (f, t1, t2, ..., tN) as follows:
(t1.*f)(t2, ..., tN) when f is a pointer to a member function of a class T and t1 is an object of type T or a reference to an object of
type T or a reference to an object of a type derived from T;
((*t1).*f)(t2, ..., tN) when f is a pointer to a member function of a class T and t1 is not one of the types described in the previous
item;
t1.*f when N == 1 and f is a pointer to member data of a class T and t1 is an object of type T or a
reference to an object of type T or a reference to an object of a
type derived from T;
(*t1).*f when N == 1 and f is a pointer to member data of a class T and t1 is not one of the types described in the previous item;
f(t1, t2, ..., tN) in all other cases.
Another general fact which I want to point out is that by default the thread constructor will copy all arguments passed to it. The reason for this is that the arguments may need to outlive the calling thread, copying the arguments guarantees that. Instead, if you want to really pass a reference, you can use a std::reference_wrapper created by std::ref.
std::thread (foo, std::ref(arg1));
By doing this, you are promising that you will take care of guaranteeing that the arguments will still exist when the thread operates on them.
Note that all the things mentioned above can also be applied to std::async and std::bind.
Since you are using C++11, a lambda expression is a nice and clean solution.
// Answer code: spawns a thread that runs the private member test().
class blub {
// Worker executed on the spawned thread.
void test() {}
public:
// Returns a thread whose body is a lambda that captures `this` and calls
// test() on it; the object must outlive the thread.
std::thread spawn() {
return std::thread( [this] { this->test(); } );
}
};
since this-> can be omitted, it could be shorten to:
std::thread( [this] { test(); } )
or just (deprecated)
std::thread( [=] { test(); } )
Here is a complete example
#include <thread>
#include <iostream>
// Demonstrates starting threads that run member functions via lambdas.
// The Wrapper object must outlive every thread it hands out, because the
// lambdas capture `this`.
class Wrapper {
public:
    // Prints a fixed line.
    void member1() {
        std::cout << "i am member1" << std::endl;
    }
    // Prints both arguments.
    void member2(const char *arg1, unsigned arg2) {
        std::cout << "i am member2 and my first arg is (" << arg1 << ") and second arg is (" << arg2 << ")" << std::endl;
    }
    // Returns a thread running member1() on this object.
    std::thread member1Thread() {
        // Capture `this` explicitly: implicit capture through [=] is deprecated.
        return std::thread([this] { member1(); });
    }
    // Returns a thread running member2(arg1, arg2) on this object.
    // arg1 is captured as a pointer, so the string must outlive the thread.
    std::thread member2Thread(const char *arg1, unsigned arg2) {
        return std::thread([this, arg1, arg2] { member2(arg1, arg2); });
    }
};
// Starts both member-function threads on one Wrapper and waits for them.
// The Wrapper has automatic storage, so it is destroyed after the joins and
// nothing leaks (the original `new Wrapper()` was never deleted).
int main(int argc, char **argv) {
    Wrapper w;
    std::thread tw1 = w.member1Thread();
    std::thread tw2 = w.member2Thread("hello", 100);
    tw1.join();
    tw2.join();
    return 0;
}
Compiling with g++ produces the following result
g++ -Wall -std=c++11 hello.cc -o hello -pthread
i am member1
i am member2 and my first arg is (hello) and second arg is (100)
@hop5 and @RnMss suggested using C++11 lambdas, but if you deal with pointers, you can use them directly:
#include <thread>
#include <iostream>
// Counter object whose bar() is invoked from worker threads.
class CFoo {
public:
    int m_i = 0;

    // Increments the counter by one.
    void bar() {
        m_i += 1;
    }
};
// Runs CFoo::bar on the same object from two threads, one after the other,
// and prints the resulting counter.
int main() {
    CFoo foo;
    for (int run = 0; run < 2; ++run) {
        // Passing &foo makes the thread operate on this object, not on a copy.
        std::thread worker(&CFoo::bar, &foo);
        worker.join();
    }
    std::cout << foo.m_i << std::endl;
    return 0;
}
outputs
2
Rewritten sample from this answer would be then:
#include <thread>
#include <iostream>
// Same Wrapper as in the lambda-based sample, rewritten to hand the member
// function pointer and `this` directly to std::thread.
class Wrapper {
public:
// Prints a fixed line.
void member1() {
std::cout << "i am member1" << std::endl;
}
// Prints both arguments.
void member2(const char *arg1, unsigned arg2) {
std::cout << "i am member2 and my first arg is (" << arg1 << ") and second arg is (" << arg2 << ")" << std::endl;
}
// Returns a thread that invokes member1 on this object.
std::thread member1Thread() {
return std::thread(&Wrapper::member1, this);
}
// Returns a thread that invokes member2(arg1, arg2) on this object.
// arg1 is passed along as a pointer, so the string must outlive the thread.
std::thread member2Thread(const char *arg1, unsigned arg2) {
return std::thread(&Wrapper::member2, this, arg1, arg2);
}
};
// Runs both member-function threads one after the other on one Wrapper.
// The Wrapper has automatic storage, so it is destroyed after the joins and
// nothing leaks (the original `new Wrapper()` was never deleted).
int main() {
    Wrapper w;
    std::thread tw1 = w.member1Thread();
    tw1.join();
    std::thread tw2 = w.member2Thread("hello", 100);
    tw2.join();
    return 0;
}
Some users have already given their answer and explained it very well.
I would like to add few more things related to thread.
How to work with functor and thread.
Please refer to below example.
The thread will make its own copy of the object while passing the object.
#include<thread>
#include<Windows.h>
#include<iostream>
using namespace std;
// Functor used to show whether std::thread copies the callable it is given:
// both the constructor and operator() print the object's address.
class CB
{
public:
    // Prints the address of the object being constructed.
    CB()
    {
        cout << "this=" << this << endl;
    }
    void operator()();
};

// Thread body: prints the address of the object it runs on, then counts
// from 0 to 4, sleeping one second after each line.
void CB::operator()()
{
    cout << "this=" << this << endl;
    int i = 0;
    while (i < 5)
    {
        cout << "CB()=" << i << endl;
        Sleep(1000);
        ++i;
    }
}
// Passes a named functor to std::thread by value.
// main returns int: `void main()` is not valid standard C++.
int main()
{
    CB obj; // please note the address of obj.
    thread t(obj); // here obj will be passed by value
                   // i.e. the thread will make its own local copy of it.
                   // We can confirm it by comparing the address printed in
                   // the constructor with the address printed in the function.
    t.join();
    return 0;
}
Another way of achieving the same thing is like:
// Passes a temporary functor to std::thread.
// main returns int: `void main()` is not valid standard C++.
int main()
{
    // The extra parentheses keep this from being parsed as a function declaration.
    thread t((CB()));
    t.join();
    return 0;
}
But if you want to pass the object by reference then use the below syntax:
// Passes the functor by reference, so the thread runs on obj itself.
// main returns int: `void main()` is not valid standard C++.
int main()
{
    CB obj;
    //thread t(obj);
    thread t(std::ref(obj)); // obj must stay alive until the join below
    t.join();
    return 0;
}
I am trying to get the code example from there to work:
https://solarianprogrammer.com/2012/10/17/cpp-11-async-tutorial/
// Returns double the given value.
int twice(int m){
    return m * 2;
}
// Launches twice(0..9) through std::async and prints the results in order.
int main(){
    std::vector< std::future<int> > pending;
    for (int arg = 0; arg != 10; ++arg) {
        pending.push_back(std::async(twice, arg));
    }
    for (std::future<int> &result : pending) {
        // get() blocks until the corresponding task has produced its value.
        std::cout << result.get() << std::endl;
    }
    return 0;
}
This code results in:
terminate called after throwing an instance of 'std::system_error'
what(): Unknown error -1
I am using these flags for compilation:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread")
The code below results in the same error (we just instantiate some minimal and unused object):
// Returns double the given value.
int twice(int m){
    return m * 2;
}

// Minimal class with an out-of-line, empty constructor; it is only
// instantiated, never used.
class Foo {
public:
    Foo();
};
Foo::Foo(){}

// Same async loop as before, with an unused Foo instance created first.
int main(){
    Foo foo;
    std::vector< std::future<int> > pending;
    for (int arg = 0; arg != 10; ++arg) {
        pending.push_back(std::async(twice, arg));
    }
    for (std::future<int> &result : pending) {
        // get() blocks until the corresponding task has produced its value.
        std::cout << result.get() << std::endl;
    }
    return 0;
}
This ends up with the similar results:
terminate called after throwing an instance of 'std::system_error'
what(): Unknown error -1
But this works fine (i.e. prints: 0 2 4 ... 18 as expected):
// Returns double the given value.
int twice(int m){
    return m * 2;
}

// Same async loop, but the unused object is an nsp::Foo that comes from a
// separately built library.
int main(){
    nsp::Foo foo; // <---- difference here !
    std::vector< std::future<int> > pending;
    for (int arg = 0; arg != 10; ++arg) {
        pending.push_back(std::async(twice, arg));
    }
    for (std::future<int> &result : pending) {
        // get() blocks until the corresponding task has produced its value.
        std::cout << result.get() << std::endl;
    }
    return 0;
}
nsp::Foo is now defined/declared in another library (but with the same code). This library in compiled in the same CMakeLists.txt folder with the same compilation flags. And the executable links to it.
What is going on ?
I'm trying to share a std::map<std::string, std::chrono::system_clock::time_point> map: each string is a hostname identifying a site, and the time_point is the last time a process visited that site.
I was trying with mmap but each process still see its own copy of the map.
Here's my code (I took away all the methods and variables not concerning my problem):
#include <sys/mman.h>
#include <unistd.h>
#include <iostream>
#include <map>
#include <string>
#include <chrono>
typedef std::map<std::string, std::chrono::system_clock::time_point> mymap;
typedef mymap::iterator iter;
typedef mymap* mapPointer;
// Singleton meant to keep the host -> last-visit-time map in an anonymous
// shared mapping so that forked processes can see each other's entries.
class MmapManager {
private:
// Maps an anonymous MAP_SHARED region and stores its address in `frequency`.
// NOTE(review): sizeof(frequency) is the size of the pointer, not of the map
// object, and no map is ever constructed in the mapped bytes (no placement
// new is visible in this class).
MmapManager() {
frequency = (mapPointer) mmap(NULL, sizeof(frequency), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
// A failed mapping is only logged; `frequency` keeps the MAP_FAILED value.
if (frequency == MAP_FAILED) {
std::cout << "mapping failed" << std::endl;
}
};
// Only logs; the mapping is not released here.
~MmapManager() {
std::cout << "~MmapManager()" << std::endl;
}
public:
// my class was designed with the singleton pattern
// The instance is created on the first call; in the accompanying main that
// first call happens inside each child, i.e. after fork().
static MmapManager& getInstance() {
static MmapManager instance;
return instance;
}
private:
// pointer to my map
mapPointer frequency;
public:
// check if the process already visited site "host"
bool isHostAlreadyVisited(std::string host) {
return frequency->find(host) != frequency->end();
}
// add new visited site and time of the visit
void addHost(std::string host) {
(*frequency)[host] = std::chrono::system_clock::now();
std::cout << "PROC " << getpid() << " added " << host << std::endl;
}
// get time of the visit for site "host"
// operator[] inserts a default-constructed time_point when host is absent.
std::chrono::system_clock::time_point getElement(std::string host) {
return (*frequency)[host];
}
// print the map
void showMap(void) {
std::cout << "PROC " << getpid() << " prints map keys" << std::endl;
for (auto it = frequency->begin(); it != frequency->end(); ++it) {
std::cout << it->first << std::endl;
}
}
};
int main(void) {
// simulate the processes
for (int i=0; i<5; i++) {
// child process
if (fork() == 0) {
// if child never visited this site...
if (! MmapManager::getInstance().isHostAlreadyVisited("www.google.com")) {
std::cout << "PID " << getpid() << " www.google.com is new" << std::endl;
// ...add it to the map
MmapManager::getInstance().addHost("www.google.com");
}
else {
// if child already visited it, calculate
// how much time passed since last visit
auto now = std::chrono::system_clock::now();
auto before = MmapManager::getInstance().getElement("www.google.com");
std::chrono::duration<double> diff = now-before;
std::cout << "PID " << getpid() << " visited www.google.com " << diff.count() << " seconds ago" << std::endl;
}
MmapManager::getInstance().showMap();
_exit(EXIT_SUCCESS);
}
}
return 0;
}
Here's one of the possible outputs:
PID 12457 www.google.com is new
PID 12459 www.google.com is new
PID 12458 www.google.com is new
PID 12460 www.google.com is new
PID 12461 www.google.com is new
I can't use other external libraries like Boost or use threads: I know they share memory, but the program was designed this way (with child processes doing stuff) and I can't modify it (original code is not mine).
Why does each process still see its own copy of the map?
Edit: I think I did all the things you suggested me:
insertion in map is protected with a lock mechanism (thanks kfsone);
created a custom allocator for string and another for map (thanks Maxim Egorushkin for these two);
map is allocated before forking (thanks Zan Lynx).
The output is not different and map is still not shared:
MmapManager()
printMap
map empty
PID 5085 www.google.com is new
PID 5086 www.google.com is new
PROC 5086 added www.goole.com
PROC 5085 added www.goole.com
PID 5087 www.google.com is new
PROC 5087 added www.goole.com
You suggested me to use Boost but I'd like to use it after my code will work: I'm not reinventing the wheel, just learning the hard way.
Here follows my new code:
#include <sys/mman.h>
#include <unistd.h>
#include <sys/shm.h> /* shmat(), IPC_RMID */
#include <semaphore.h> /* sem_open(), sem_destroy(), sem_wait().. */
#include <fcntl.h> /* O_CREAT, O_EXEC */
#include <stdlib.h>
#include <iostream>
#include <map>
#include <string>
#include <chrono>
#include <cstddef>
#include <vector>
#include <limits>
#include <memory>
#include <new>
// Stateless allocator that gets its memory from malloc()/free().
// All instances are interchangeable (operator== is always true).
// NOTE: malloc memory is private to the process; this allocator does not
// place anything in shared memory.
template<typename T> class stringAllocator {
public :
    typedef T value_type;
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;
    typedef T* pointer;
    typedef T const * const_pointer;
    typedef T& reference;
    typedef T const & const_reference;

    template<typename U> struct rebind {
        typedef stringAllocator<U> other;
    };

    // Address of an object, immune to an overloaded operator&.
    pointer address (reference value ) const {
        return std::addressof(value);
    }
    const_pointer address (const_reference value) const {
        return std::addressof(value);
    }

    // Largest element count whose byte size still fits in size_type.
    size_type max_size () const noexcept {
        return std::numeric_limits <size_type>::max() / sizeof(T);
    }

    stringAllocator () noexcept {}
    stringAllocator (stringAllocator const &) noexcept {}
    template <typename U>
    stringAllocator(stringAllocator <U> const &) noexcept {}
    ~stringAllocator() noexcept {}

    // Returns uninitialized storage for n objects of type T.
    // Throws std::bad_alloc when n * sizeof(T) would overflow or when malloc
    // fails, instead of handing a null or undersized buffer to the container.
    pointer allocate (size_type n) {
        if (n > max_size()) {
            throw std::bad_alloc();
        }
        void *raw = malloc(n * sizeof(value_type));
        // malloc(0) may legitimately return null; only a failed non-empty
        // request is an error.
        if (raw == nullptr && n != 0) {
            throw std::bad_alloc();
        }
        return static_cast<pointer>(raw);
    }

    // Releases storage obtained from allocate(); the count is not needed.
    void deallocate (pointer p, size_type) {
        free(p);
    }

    // Copy-constructs a T from value in the storage at p.
    void construct (pointer p, const_reference value) {
        new(p) T(value);
    }

    // Runs the destructor of the object at p without freeing its storage.
    void destroy (pointer p) {
        p->~T();
    }
};

// Any two stringAllocator instances can free each other's memory.
template <class T1, class T2>
bool operator==(const stringAllocator<T1>&, const stringAllocator<T2>&) noexcept {
    return true;
}
template <class T1, class T2>
bool operator!=(const stringAllocator<T1>&, const stringAllocator<T2>&) noexcept {
    return false;
}
typedef std::basic_string<
char,
std::char_traits<char>,
stringAllocator<char>
> myString;
/*************************************** map allocator ****************************************/
// Stateless allocator that gets its memory from ::operator new/delete.
// All instances are interchangeable (operator== is always true).
// NOTE: this is ordinary heap memory, private to the process; this allocator
// does not place anything in shared memory.
template<typename T> class mapAllocator{
public :
    typedef T value_type;
    typedef value_type* pointer;
    typedef const value_type* const_pointer;
    typedef value_type& reference;
    typedef const value_type& const_reference;
    typedef std::size_t size_type;
    typedef std::ptrdiff_t difference_type;

    template<typename U>
    struct rebind {
        typedef mapAllocator<U> other;
    };

    mapAllocator() noexcept {}
    mapAllocator (mapAllocator const &) noexcept {}
    ~mapAllocator() noexcept {}
    template<typename U>
    mapAllocator(mapAllocator<U> const&) noexcept {}

    // Address of an object, immune to an overloaded operator&.
    pointer address(reference r) const { return std::addressof(r); }
    const_pointer address(const_reference r) const { return std::addressof(r); }

    // Returns uninitialized storage for cnt objects of type T.
    // The second argument is an ignored locality hint; it is spelled
    // `const void*` because std::allocator<void>::const_pointer no longer
    // exists in C++20. Throws std::bad_alloc when cnt * sizeof(T) would
    // overflow or when ::operator new fails.
    pointer allocate(size_type cnt, const void* = 0) {
        if (cnt > max_size()) {
            throw std::bad_alloc();
        }
        return static_cast<pointer>(::operator new(cnt * sizeof (T)));
    }

    // Releases storage obtained from allocate(); the count is not needed.
    void deallocate(pointer p, size_type) {
        ::operator delete(p);
    }

    // Largest element count whose byte size still fits in size_type.
    size_type max_size() const noexcept {
        return std::numeric_limits<size_type>::max() / sizeof(T);
    }

    // Copy-constructs a T from t in the storage at p.
    void construct(pointer p, const T& t) {
        new(p) T(t);
    }

    // Runs the destructor of the object at p without freeing its storage.
    void destroy(pointer p) {
        p->~T();
    }
};

// Any two mapAllocator instances can free each other's memory.
template <class T1, class T2>
bool operator==(const mapAllocator<T1>&, const mapAllocator<T2>&) noexcept {
    return true;
}
template <class T1, class T2>
bool operator!=(const mapAllocator<T1>&, const mapAllocator<T2>&) noexcept {
    return false;
}
/*************************************** end map allocator ****************************************/
// class compare for map with std::string as Key
// Strict-weak-ordering comparator for myString keys: true when `first`
// sorts lexicographically before `second`.
class strless {
public:
    // Takes both strings by const reference so that a comparison does not
    // copy (and allocate) two strings every time the map calls it.
    bool operator() (const myString &first, const myString &second) const {
        return first.compare(second) < 0;
    }
};
template<typename Key, typename T>
using Map = std::map<
Key, // class Key
T, // class T
strless, // class Compare = std::less<Key>
mapAllocator<std::pair<const Key, T> // class Allocator = std::allocator<std::pair<const Key, T> >
>
>;
// typedef for the actual map I need to share between processes
typedef Map<myString, std::chrono::system_clock::time_point> frequencyMap;
// Singleton that owns the host -> last-visit-time map and a named semaphore
// used to serialize insertions.
// NOTE(review): fmap is an ordinary data member and its allocators call
// malloc / ::operator new, so nothing visible here places the map or its
// nodes in memory shared between processes.
class MmapManager {
private:
// Creates the named semaphore with an initial count of 1 and immediately
// unlinks its name. NOTE(review): the result of sem_open is not checked.
MmapManager() {
std::cout << "MmapManager()" << std::endl;
semMmap = sem_open("semaphore", O_CREAT|O_EXCL, 0644, 1);
sem_unlink("semaphore");
};
// Only logs; the semaphore is not closed here.
~MmapManager() {
std::cout << "~MmapManager()" << std::endl;
}
public:
// Returns the single instance, creating it on the first call.
static MmapManager& getInstance() {
static MmapManager instance;
return instance;
}
private:
// The map itself, stored directly inside the singleton object.
frequencyMap fmap;
// Semaphore guarding insertions in addHost().
sem_t *semMmap;
public:
// Intentionally empty; calling it forces the singleton to be constructed.
void start(void) {}
// True when host is already a key of the map. Does not take the semaphore.
bool isHostAlreadyVisited(myString host) {
return fmap.find(host) != fmap.end();
}
// Stores the current time for host while holding the semaphore, then logs.
void addHost(myString host) {
sem_wait(semMmap);
fmap[host] = std::chrono::system_clock::now();
sem_post(semMmap);
std::cout << "PROC " << getpid() << " added " << host << std::endl;
}
// get time of the visit for site "host"
// operator[] inserts a default-constructed time_point when host is absent;
// this access does not take the semaphore.
std::chrono::system_clock::time_point getElement(myString host) {
return fmap[host];
}
// Prints all keys on one line, or "map empty" when there are none.
void printMap(void) {
std::cout << "printMap" << std::endl;
if (!fmap.empty()) {
// Each element is copied into `it` by this by-value range-for.
for (auto it : fmap) {
std::cout << it.first << ' ';
}
std::cout << std::endl;
} else {
std::cout << "map empty" << std::endl;
}
}
};
// Creates the singleton before forking, then forks three children. Each
// child records a first visit to the host or reports how long ago the last
// visit was, and exits. The parent finally prints its own view of the map.
// NOTE(review): the parent does not wait for the children, so its printMap()
// may run before any child has done its work.
int main(void) {
    // Single spelling of the key, used for the lookup, the insertion and the
    // read-back. The original inserted "www.goole.com" but looked up
    // "www.google.com", so the check could never succeed.
    static const char kHost[] = "www.google.com";

    MmapManager::getInstance().start();
    for (int i=0; i<3; i++) {
        if (fork() == 0) {
            if (!MmapManager::getInstance().isHostAlreadyVisited(kHost)) {
                std::cout << "PID " << getpid() << " www.google.com is new" << std::endl;
                MmapManager::getInstance().addHost(kHost);
            }
            else {
                // if child already visited it, calculate
                // how much time passed since last visit
                auto now = std::chrono::system_clock::now();
                auto before = MmapManager::getInstance().getElement(kHost);
                std::chrono::duration<double> diff = now-before;
                std::cout << "PID " << getpid() << " visited www.google.com " << diff.count() << " seconds ago" << std::endl;
            }
            _exit(EXIT_SUCCESS);
        }
    }
    MmapManager::getInstance().printMap();
    return 0;
}
This does not work because although you placed the container object into the shared memory, the elements are still allocated from the heap and thus they are not accessible by other processes.
You need a custom allocator that allocates elements in the shared memory. See Creating maps in shared memory for how it is done.
Note that the string class you use must also allocate memory from the shared memory.
In other words, you cannot have pointers to heap memory in the shared memory, because heap memory is not shared between processes. std classes have an allocator template argument, the default one allocates memory from the heap. This needs to be changed to a shared memory allocator to be able to share such objects via shared memory.
Another reason your code doesn't work is that you only create the maps after you called fork().
If you want your MAP_SHARED|MAP_ANONYMOUS map to be seen by all the children then you have to call mmap() before forking.