Parallel QuickSort, can someone help me?

Parallel QuickSort, can someone help me? - python-3.x

I am trying to parallelize quicksort by moving the partition step (splitting the list into two sub-lists by comparing each element with the pivot) into a CUDA kernel. I am having problems with the syntax and with keeping track of the write position at the end of each of the two new lists. How do I get rid of the syntax errors and save the sizes of the two lists at the end of the kernel?
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
# Question's original attempt: a recursive quicksort whose partition step is
# handed to a CUDA kernel. The kernel source below does not compile; the nvcc
# log that follows in the post reports the syntax errors.
def quickSort_paralleloGlobal(listElements: list) -> list:
# Lists of length 0 or 1 are returned unchanged.
if len(listElements) <= 1:
return listElements
else:
# The last element is removed from the input list and used as the pivot.
pivo = listElements.pop()
list1 = []
list2 = []
# CUDA source text; %(ARRAY_SIZE)s is filled in with the list length below.
kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int pivo)
{
int index1 = 0, index2 = 0;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (lista[i] < pivo
{
list1[index2] = listElements[i];
index1++;
}
else
{
list2[index2] = listElements[i];
index2++;
}
}
"""
SIZE = len(listElements)
listElements = np.asarray(listElements)
listElements = listElements.astype(np.int)
# Copy the input to the device and allocate two output buffers of the same byte size.
lista_gpu = cuda.mem_alloc(listElements.nbytes)
cuda.memcpy_htod(lista_gpu, listElements)
list1_gpu = cuda.mem_alloc(listElements.nbytes)
list2_gpu = cuda.mem_alloc(listElements.nbytes)
BLOCK_SIZE = 256
# Ceiling division: enough blocks of BLOCK_SIZE threads to cover SIZE elements.
NUM_BLOCKS = (SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE
kernel_code = kernel_code_template % {
'ARRAY_SIZE': SIZE
}
mod = compiler.SourceModule(kernel_code)
arraysQuick = mod.get_function("separateQuick")
arraysQuick(lista_gpu, list1_gpu, list2_gpu, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
# NOTE(review): list1_gpu/list2_gpu come from cuda.mem_alloc; whether such
# allocations offer .get() is not shown here - confirm.
list1 = list1_gpu.get()
list2 = list2_gpu.get()
# The results of these two comparisons are discarded.
np.allclose(list1, list1_gpu.get())
np.allclose(list2, list2_gpu.get())
return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
Here is the runtime error:
Traceback (most recent call last):
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/quickSort.py", line 104, in <module>
print(quickSort_paraleloGlobal([1, 5, 4, 2, 0]))
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/quickSort.py", line 60, in quickSort_paraleloGlobal
mod = compiler.SourceModule(kernel_code)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 291, in __init__
arch, code, cache_dir, include_dirs)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 254, in compile
return compile_plain(source, options, keep, nvcc, cache_dir, target)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 137, in compile_plain
stderr=stderr.decode("utf-8", "replace"))
pycuda.driver.CompileError: nvcc compilation of C:\Users\mateu\AppData\Local\Temp\tmpefxgkfkk\kernel.cu failed
[command: nvcc --cubin -arch sm_61 -m64 -Ic:\users\mateu\documents\github\ppc_sorting_and_merging\venv\lib\site-packages\pycuda\cuda kernel.cu]
[stdout:
kernel.cu
]
[stderr:
kernel.cu(10): error: expected a ")"
kernel.cu(19): warning: parsing restarts here after previous syntax error
kernel.cu(19): error: expected a statement
kernel.cu(5): warning: variable "indexMenor" was declared but never referenced
kernel.cu(5): warning: variable "indexMaior" was declared but never referenced
2 errors detected in the compilation of "C:/Users/mateu/AppData/Local/Temp/tmpxft_00004260_00000000-10_kernel.cpp1.ii".
]
Process finished with exit code 1

There are a number of problems with your code. I don't think I will be able to list them all. However one of the central problems is that you have attempted to do a naive conversion of a serial quicksort into a thread-parallel quicksort, and such a simple conversion is not possible.
To allow threads to work in a parallel fashion, while dividing up an input list into one of two separate output lists, requires a number of changes to your kernel code.
However we can address most of the other issues by limiting your kernel launches to one thread each.
With that idea, the following code appears to sort the given input correctly:
$ cat t18.py
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
# Recursive quicksort; the partition around the pivot is done by a CUDA kernel
# that is launched with a single thread (BLOCK_SIZE = NUM_BLOCKS = 1 below).
def quickSort_paralleloGlobal(listElements):
# Lists of length 0 or 1 are returned unchanged.
if len(listElements) <= 1:
return listElements
else:
# The last element is removed from the input list and used as the pivot.
pivo = listElements.pop()
# Converted to a 32-bit integer before being passed for the kernel's `int pivo` parameter.
pivo = np.int32(pivo)
# CUDA source text; %(ARRAY_SIZE)s is filled in with the list length below.
kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
{
int index1 = 0, index2 = 0;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (listElements[i] < pivo)
{
list1[index1] = listElements[i];
index1++;
}
else
{
list2[index2] = listElements[i];
index2++;
}
*l1_size = index1;
*l2_size = index2;
}
"""
SIZE = len(listElements)
listElements = np.asarray(listElements)
listElements = listElements.astype(np.int32)
# Copy the input to the device and allocate two output buffers of the same byte size.
lista_gpu = cuda.mem_alloc(listElements.nbytes)
cuda.memcpy_htod(lista_gpu, listElements)
list1_gpu = cuda.mem_alloc(listElements.nbytes)
list2_gpu = cuda.mem_alloc(listElements.nbytes)
# Two 4-byte device slots that receive the element counts of list1 and list2.
l1_size = cuda.mem_alloc(4)
l2_size = cuda.mem_alloc(4)
BLOCK_SIZE = 1
NUM_BLOCKS = 1
kernel_code = kernel_code_template % {
'ARRAY_SIZE': SIZE
}
mod = compiler.SourceModule(kernel_code)
arraysQuick = mod.get_function("separateQuick")
arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
# Read the two counts back first so the host arrays can be sized to match.
l1_sh = np.zeros(1, dtype = np.int32)
l2_sh = np.zeros(1, dtype = np.int32)
cuda.memcpy_dtoh(l1_sh, l1_size)
cuda.memcpy_dtoh(l2_sh, l2_size)
list1 = np.zeros(l1_sh, dtype=np.int32)
list2 = np.zeros(l2_sh, dtype=np.int32)
cuda.memcpy_dtoh(list1, list1_gpu)
cuda.memcpy_dtoh(list2, list2_gpu)
list1 = list1.tolist()
list2 = list2.tolist()
# Recurse on both partitions and place the pivot between them.
return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python t18.py
[0, 1, 2, 4, 5]
$
The next step in the porting process would be to convert your naive serial kernel to one that could operate in a thread-parallel fashion. One relatively simple approach would be to use atomics to manage all output data (both lists, as well as updates to the sizes of each list).
Here is one possible approach:
$ cat t18.py
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements):
    """Sort a list of integers with a quicksort whose partition step runs on the GPU.

    The last element is taken as the pivot. A CUDA kernel distributes the
    remaining elements into two device buffers (smaller than the pivot /
    everything else), reserving output slots with ``atomicAdd`` on two device
    counters. Both partitions are copied back and sorted recursively.

    Args:
        listElements: sequence of values that fit a 32-bit signed integer.
            The caller's sequence is not modified.

    Returns:
        The input itself when it has at most one element, otherwise a new
        list of plain Python ints in ascending order.
    """
    if len(listElements) <= 1:
        return listElements

    # Work on a copy so that taking the pivot does not mutate the caller's list.
    listElements = list(listElements)
    pivo = np.int32(listElements.pop())

    # CUDA source text; %(ARRAY_SIZE)s is replaced by the element count below.
    kernel_code_template = """
__global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
if (listElements[i] < pivo)
{
list1[atomicAdd(l1_size, 1)] = listElements[i];
}
else
{
list2[atomicAdd(l2_size, 1)] = listElements[i];
}
}
"""
    SIZE = len(listElements)
    listElements = np.asarray(listElements).astype(np.int32)

    # Input buffer plus two output buffers, each large enough for every element.
    lista_gpu = cuda.mem_alloc(listElements.nbytes)
    cuda.memcpy_htod(lista_gpu, listElements)
    list1_gpu = cuda.mem_alloc(listElements.nbytes)
    list2_gpu = cuda.mem_alloc(listElements.nbytes)
    # Two 4-byte device counters holding the fill level of each output buffer.
    l1_size = cuda.mem_alloc(4)
    l2_size = cuda.mem_alloc(4)

    BLOCK_SIZE = 256
    # Ceiling division: enough blocks of BLOCK_SIZE threads to cover SIZE elements.
    NUM_BLOCKS = (SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE

    kernel_code = kernel_code_template % {
        'ARRAY_SIZE': SIZE
    }
    mod = compiler.SourceModule(kernel_code)
    arraysQuick = mod.get_function("separateQuick")

    # The counters must start at zero because the kernel only ever adds to them.
    l1_sh = np.zeros(1, dtype=np.int32)
    l2_sh = np.zeros(1, dtype=np.int32)
    cuda.memcpy_htod(l1_size, l1_sh)
    cuda.memcpy_htod(l2_size, l2_sh)

    arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo,
                block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))

    # Read the counts back first so the host arrays can be sized to match.
    cuda.memcpy_dtoh(l1_sh, l1_size)
    cuda.memcpy_dtoh(l2_sh, l2_size)
    list1 = np.zeros(l1_sh, dtype=np.int32)
    list2 = np.zeros(l2_sh, dtype=np.int32)
    cuda.memcpy_dtoh(list1, list1_gpu)
    cuda.memcpy_dtoh(list2, list2_gpu)

    # int(pivo) keeps the result a homogeneous list of Python ints, matching
    # what tolist() yields for the two partitions.
    return (quickSort_paralleloGlobal(list1.tolist())
            + [int(pivo)]
            + quickSort_paralleloGlobal(list2.tolist()))
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python t18.py
[0, 1, 2, 4, 5]
$
I'm not suggesting that the above examples are perfect or defect free. Also, I have not identified each and every change I made to your code. I suggest you study the differences between these examples and your posted code.
I should also mention that this isn't a fast or efficient way to sort numbers on the GPU. I assume this is for a learning exercise. If you're interested in fast parallel sorting, you are encouraged to use a library implementation. If you want to do this from python, one possible implementation is provided by cupy

Related

Calculating number of minimum swaps to sort array (selection sort is too slow) [duplicate]

I'm working on sorting an integer sequence with no identical numbers (without loss of generality, let's assume the sequence is a permutation of 1,2,...,n) into its natural increasing order (i.e. 1,2,...,n). I was thinking about directly swapping the elements (regardless of the positions of elements; in other words, a swap is valid for any two elements) with minimal number of swaps (the following may be a feasible solution):
Swap two elements with the constraint that either one or both of them should be swapped into the correct position(s). Until every element is put in its correct position.
But I don't know how to mathematically prove if the above solution is optimal. Anyone can help?

I was able to prove this with graph-theory. Might want to add that tag in :)
Create a graph with n vertices. Create an edge from node n_i to n_j if the element in position i should be in position j in the correct ordering. You will now have a graph consisting of several non-intersecting cycles. I argue that the minimum number of swaps needed to order the graph correctly is
$$M = \sum_{c \,\in\, \text{cycles}} \bigl(\operatorname{size}(c) - 1\bigr)$$
Take a second to convince yourself of that...if two items are in a cycle, one swap can just take care of them. If three items are in a cycle, you can swap a pair to put one in the right spot, and a two-cycle remains, etc. If n items are in a cycle, you need n-1 swaps. (This is always true even if you don't swap with immediate neighbors.)
Given that, you may now be able to see why your algorithm is optimal. If you do a swap and at least one item is in the right position, then it will always reduce the value of M by 1. For any cycle of length n, consider swapping an element into the correct spot, occupied by its neighbor. You now have a correctly ordered element, and a cycle of length n-1.
Since M is the minimum number of swaps, and your algorithm always reduces M by 1 for each swap, it must be optimal.

All the cycle counting is very difficult to keep in your head. There is a way that is much simpler to memorize.
First, let's go through a sample case manually.
Sequence: [7, 1, 3, 2, 4, 5, 6]
Enumerate it: [(0, 7), (1, 1), (2, 3), (3, 2), (4, 4), (5, 5), (6, 6)]
Sort the enumeration by value: [(1, 1), (3, 2), (2, 3), (4, 4), (5, 5), (6, 6), (0, 7)]
Start from the beginning. While the index is different from the enumerated index keep on swapping the elements defined by index and enumerated index. Remember: swap(0,2);swap(0,3) is the same as swap(2,3);swap(0,2)
swap(0, 1) => [(3, 2), (1, 1), (2, 3), (4, 4), (5, 5), (6, 6), (0, 7)]
swap(0, 3) => [(4, 4), (1, 1), (2, 3), (3, 2), (5, 5), (6, 6), (0, 7)]
swap(0, 4) => [(5, 5), (1, 1), (2, 3), (3, 2), (4, 4), (6, 6), (0, 7)]
swap(0, 5) => [(6, 6), (1, 1), (2, 3), (3, 2), (4, 4), (5, 5), (0, 7)]
swap(0, 6) => [(0, 7), (1, 1), (2, 3), (3, 2), (4, 4), (5, 5), (6, 6)]
I.e. semantically you sort the elements and then figure out how to put them to the initial state via swapping through the leftmost item that is out of place.
Python algorithm is as simple as this:
def swap(arr: list, i: int, j: int) -> None:
    """Exchange the items at positions *i* and *j* of *arr* in place."""
    arr[i], arr[j] = arr[j], arr[i]


def minimum_swaps(arr: list) -> int:
    """Return the minimum number of swaps that sorts *arr* in ascending order.

    Each value is paired with its original index and the pairs are ordered by
    value. Slot ``i`` of that ordering then records where the i-th smallest
    value started out. Repeatedly swapping the pair in slot ``i`` into the
    slot named by its original index undoes the sort one swap at a time; the
    number of swaps this takes is the answer.

    The count is minimal when all values are distinct. *arr* is not modified.
    """
    annotated = sorted(enumerate(arr), key=lambda it: it[1])
    count = 0
    i = 0
    while i < len(annotated):
        origin = annotated[i][0]
        if origin == i:
            # Slot i holds the pair that started at index i: nothing to undo.
            i += 1
            continue
        # Send this pair to its original slot; slot i gets re-examined next.
        swap(annotated, i, origin)
        count += 1
    return count
Thus, you don't need to memorize visited nodes or compute some cycle length.

For your reference, here is an algorithm that I wrote to compute the minimum number of swaps needed to sort the array. It finds the cycles as described by @Andrew Mao.
/**
* Finds the minimum number of swaps to sort given array in increasing order.
* @param ar array of <strong>non-negative distinct</strong> integers.
* input array will be overwritten during the call!
* @return min no of swaps
*/
public int findMinSwapsToSort(int[] ar) {
int n = ar.length;
// Remember the original index of every value.
Map<Integer, Integer> m = new HashMap<>();
for (int i = 0; i < n; i++) {
m.put(ar[i], i);
}
Arrays.sort(ar);
// Replace each sorted value by the index it came from.
for (int i = 0; i < n; i++) {
ar[i] = m.get(ar[i]);
}
m = null;
int swaps = 0;
for (int i = 0; i < n; i++) {
int val = ar[i];
// -1 marks a slot already consumed while walking an earlier cycle.
if (val < 0) continue;
// Follow the cycle starting at i; every hop before returning to i counts as one swap.
while (val != i) {
int new_val = ar[val];
ar[val] = -1;
val = new_val;
swaps++;
}
ar[i] = -1;
}
return swaps;
}

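We do not need to swap the actual elements; we only need to find the length of each cycle formed by the elements that are not at their correct index.
A cycle of length `cycle` needs `cycle - 1` swaps, and the minimum number of swaps is the sum of that over all cycles.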
Here is the code...
/**
 * Counts the swaps needed to sort {@code arr} by summing (cycle length - 1)
 * over the cycles obtained by following value {@code v} to index {@code v - 1}.
 *
 * @param arr values used as 1-based positions
 * @return total number of swaps
 */
static int minimumSwaps(int[] arr) {
    boolean[] seen = new boolean[arr.length];
    int total = 0;
    for (int start = 0; start < arr.length; start++) {
        int length = 0;
        // Walk from start until an already-seen index is reached.
        for (int pos = start; !seen[pos]; pos = arr[pos] - 1) {
            seen[pos] = true;
            length++;
        }
        // length is 0 when start was consumed by an earlier walk.
        if (length > 0) {
            total += length - 1;
        }
    }
    return total;
}

@Archibald, I like your solution, and my initial assumption was also that sorting the array would be the simplest approach. However, I don't see the need for what I've dubbed the reverse traverse, i.e. enumerating the array, sorting it, and then computing the swaps on the enumerated pairs.
I find it simpler to subtract 1 from each element in the array and then to compute the swaps required to sort that list
here is my tweak/solution:
def swap(arr: list, i: int, j: int) -> None:
    """Exchange the items at positions *i* and *j* of *arr* in place."""
    arr[i], arr[j] = arr[j], arr[i]


def minimum_swaps(arr: list) -> int:
    """Return the minimum number of swaps that sorts a permutation of 1..n.

    The values are shifted to 0..n-1 so that each value equals the index it
    belongs at. Any value found out of place is swapped directly into its
    home slot, and the swaps are counted. *arr* itself is not modified.

    Raises:
        ValueError: if *arr* is not a permutation of 1..len(arr). Without
            this check such input (for example ``[1, 1]``) would keep
            swapping forever.
    """
    if sorted(arr) != list(range(1, len(arr) + 1)):
        raise ValueError("arr must be a permutation of 1..len(arr)")
    a = [x - 1 for x in arr]
    swaps = 0
    i = 0
    while i < len(a):
        if a[i] == i:
            i += 1
            continue
        # a[i] names its own destination; slot i is re-examined afterwards.
        swap(a, i, a[i])
        swaps += 1
    return swaps
As for proving optimality, I think #arax has a good point.

// Assuming that we are dealing with only sequence started with zero
function minimumSwaps(arr) {
var len = arr.length
var visitedarr = []
var i, start, j, swap = 0
for (i = 0; i < len; i++) {
if (!visitedarr[i]) {
start = j = i
var cycleNode = 1
while (arr[j] != start) {
j = arr[j]
visitedarr[j] = true
cycleNode++
}
swap += cycleNode - 1
}
}
return swap
}

I really liked the solution of #Ieuan Uys in Python.
What I improved on his solution;
While loop is iterated one less to increase speed; while i < len(a) - 1
Swap function is de-capsulated to make one, single function.
Extensive code comments are added to increase readability.
My code in python.
def minimumSwaps(arr):
    """Return the minimum number of swaps that sorts a permutation of 1..n.

    Values are shifted to 0..n-1 so that each value equals its target index.
    The scan stops one slot early: once every other slot is correct, the
    last one must be correct too. *arr* itself is not modified.

    Raises:
        ValueError: if *arr* is not a permutation of 1..len(arr). Without
            this check such input (for example ``[2, 2]``) would keep
            swapping forever.
    """
    if sorted(arr) != list(range(1, len(arr) + 1)):
        raise ValueError("arr must be a permutation of 1..len(arr)")
    # Make the values start from zero so they match index values.
    a = [x - 1 for x in arr]
    swaps = 0
    i = 0
    while i < len(a) - 1:
        if a[i] == i:
            i += 1
            continue
        # Send a[i] to the slot it belongs in; whatever lived there comes
        # back to slot i and is checked on the next pass.
        target = a[i]
        a[i], a[target] = a[target], target
        swaps += 1
    return swaps
Detailed explanation on what code does on an array with size n;
We check every value except the last one (n-1 iterations) in the array, one by one. If a value does not match its array index, we send it to the place whose index equals its value. For instance, if a[0] = 3, this value should be swapped with a[3]: a[0] and a[3] are swapped, and the value 3 ends up at a[3], where it is supposed to be. One value has been sent to its place. I am not concerned with what is now at a[0]: if it is not 0, the while loop will look at the same index again and swap it to its own place on a later pass.
Real Example
a[4, 2, 1, 0, 3]
#iteration 0, check a[0]. 4 should be located at a[4] where the value is 3. Swap them.
a[3, 2, 1, 0, 4] #we sent 4 to the right location now.
#iteration 1, check a[1]. 2 should be located at a[2] where the value is 1. Swap them.
a[3, 1, 2, 0, 4] #we sent 2 to the right location now.
#iteration 2, check a[2]. 2 is already located at a[2]. Don't do anything, continue.
a[3, 1, 2, 0, 4]
#iteration 3, check a[3]. 0 should be located at a[0] where the value is 3. Swap them.
a[0, 1, 2, 3, 4] #we sent 0 to the right location now.
# There is no need to check final value of array. Since all swaps are done.

Nicely done solution by @bekce. If using C#, the initial code that sets up the modified array ar can be succinctly expressed as:
var origIndexes = Enumerable.Range(0, n).ToArray();
Array.Sort(ar, origIndexes);
then use origIndexes instead of ar in the rest of the code.

Swift 4 version:
// Returns the number of swaps found by walking the cycles of the
// "sorted slot -> original index" mapping of `arr`.
func minimumSwaps(arr: [Int]) -> Int {
struct Pair {
let index: Int
let value: Int
}
// Pair every value with its original index, then order the pairs by value.
var positions = arr.enumerated().map { Pair(index: $0, value: $1) }
positions.sort { $0.value < $1.value }
// indexes[i] is the original index of the i-th smallest value.
var indexes = positions.map { $0.index }
var swaps = 0
for i in 0 ..< indexes.count {
var val = indexes[i]
if val < 0 {
continue // Already visited.
}
// Follow the cycle that starts at i; each hop before returning to i is one swap.
while val != i {
let new_val = indexes[val]
indexes[val] = -1
val = new_val
swaps += 1
}
indexes[i] = -1
}
return swaps
}

This is the sample code in C++ that finds the minimum number of swaps to sort a permutation of the sequence of (1,2,3,4,5,.......n-2,n-1,n)
#include <iostream>
#include <vector>
using namespace std;

// Reads n followed by a permutation of 1..n, prints the sorted sequence and
// then the minimum number of swaps needed to sort it.
int main()
{
    int n;
    // Reject unreadable or negative sizes instead of sizing the array from garbage.
    if (!(cin >> n) || n < 0)
        return 1;

    // 1-based storage; std::vector replaces the non-standard variable-length array.
    vector<int> arr(n + 1, 0);
    for (int i = 1; i <= n; ++i)
        cin >> arr[i];

    int num = 0;
    for (int i = 1; i <= n; ++i)
    {
        if (i != arr[i]) // element i is part of a cycle that is not yet resolved
        {
            int j = arr[i];
            arr[i] = 0; // 0 is the sentinel that marks the start of this cycle
            while (j != 0) // traverse the cycle, fixing one node per step
            {
                int k = arr[j];
                arr[j] = j;
                j = k;
                num++;
            }
            // The loop visited every node of the cycle; a cycle of k nodes needs k-1 swaps.
            num--;
        }
    }

    for (int i = 1; i <= n; ++i)
        cout << arr[i] << " ";
    cout << endl;
    cout << num << endl;
    return 0;
}

Solution using Javascript.
First I set all the elements with their current index that need to be ordered, and then I iterate over the map to order only the elements that need to be swapped.
// Counts swaps while moving misplaced values to index (value - 1).
// Note: `arr` is modified in place.
function minimumSwaps(arr) {
// value -> current index, for every value that is not at index value-1
const mapUnorderedPositions = new Map()
for (let i = 0; i < arr.length; i++) {
if (arr[i] !== i+1) {
mapUnorderedPositions.set(arr[i], i)
}
}
let minSwaps = 0
while (mapUnorderedPositions.size > 1) {
// Take the first remaining entry: x is the value, y the index recorded for it
const currentElement = mapUnorderedPositions.entries().next().value
const x = currentElement[0]
const y = currentElement[1]
// Skip element in map if its already ordered
if (x-1 !== y) {
// Update unordered position index of swapped element
mapUnorderedPositions.set(arr[x-1], y)
// swap in array
arr[y] = arr[x-1]
arr[x-1] = x
// Increment swaps
minSwaps++
}
// x has been handled, so drop it from the map
mapUnorderedPositions.delete(x)
}
return minSwaps
}
If you have an input like 7 2 4 3 5 6 1, this is how the debugging will go:
Map { 7 => 0, 4 => 2, 3 => 3, 1 => 6 }
currentElement [ 7, 0 ]
swapping 1 with 7
[ 1, 2, 4, 3, 5, 6, 7 ]
currentElement [ 4, 2 ]
swapping 3 with 4
[ 1, 2, 3, 4, 5, 6, 7 ]
currentElement [ 3, 2 ]
skipped
minSwaps = 2

Finding the minimum number of swaps required to put a permutation of 1..N in order.
We can use the fact that we know what the sorted result will be: 1..N, which means we don't actually have to perform the swaps, just count them.
The shuffling of 1..N is called a permutation, and is composed of disjoint cyclic permutations, for example, this permutation of 1..6:
1 2 3 4 5 6
6 4 2 3 5 1
Is composed of the cyclic permutations (1,6)(2,4,3)(5)
1->6(->1) cycle: 1 swap
2->4->3(->2) cycle: 2 swaps
5(->5) cycle: 0 swaps
So a cycle of k elements requires k-1 swaps to put in order.
Since we know where each element "belongs" (i.e. value k belongs at position k-1) we can easily traverse the cycle. Start at 0, we get 6, which belongs at 5,
and there we find 1, which belongs at 0 and we're back where we started.
To avoid re-counting a cycle later, we track which elements were visited - alternatively you could perform the swaps so that the elements are in the right place when you visit them later.
The resulting code:
def minimumSwaps(arr):
    """Return the minimum number of swaps that sorts a permutation of 1..n.

    Value ``k`` belongs at index ``k - 1``, so following
    ``index -> arr[index] - 1`` walks a cycle of the permutation. A cycle of
    ``k`` elements costs ``k - 1`` swaps: one for every element reached after
    the first. Nothing is swapped; *arr* is only read.
    """
    visited = [False] * len(arr)
    numswaps = 0
    for start, value in enumerate(arr):
        if visited[start]:
            continue
        visited[start] = True
        # Walk the rest of the cycle; each newly reached element is one swap.
        j = value - 1
        while not visited[j]:
            numswaps += 1
            visited[j] = True
            j = arr[j] - 1
    return numswaps

An implementation on integers with primitive types in Java (and tests).
import java.util.Arrays;
/**
 * Computes the number of swaps needed to sort an int array by walking the
 * cycles of the "current index to sorted index" mapping.
 */
public class MinSwaps {
/**
 * @param unordered the array to analyse; it is cloned before sorting
 * @return the number of swaps counted over all cycles
 */
public static int computate(int[] unordered) {
int size = unordered.length;
int[] ordered = order(unordered);
int[] realPositions = realPositions(ordered, unordered);
// touchs[k] becomes true once index k has been visited in some cycle.
boolean[] touchs = new boolean[size];
Arrays.fill(touchs, false);
int i;
int landing;
int swaps = 0;
for(i = 0; i < size; i++) {
if(!touchs[i]) {
landing = realPositions[i];
while(!touchs[landing]) {
touchs[landing] = true;
landing = realPositions[landing];
// Count a swap only when the walk moves on to an unvisited index.
if(!touchs[landing]) { swaps++; }
}
}
}
return swaps;
}
/** Maps every index of {@code unordered} to the index of its value in {@code ordered}. */
private static int[] realPositions(int[] ordered, int[] unordered) {
int i;
int[] positions = new int[unordered.length];
for(i = 0; i < unordered.length; i++) {
positions[i] = position(ordered, unordered[i]);
}
return positions;
}
/** Linear search: first index of {@code value} in {@code ordered}, or -1 if absent. */
private static int position(int[] ordered, int value) {
int i;
for(i = 0; i < ordered.length; i++) {
if(ordered[i] == value) {
return i;
}
}
return -1;
}
/** Returns a sorted copy of {@code unordered}. */
private static int[] order(int[] unordered) {
int[] ordered = unordered.clone();
Arrays.sort(ordered);
return ordered;
}
}
Tests
import org.junit.Test;
import static org.junit.Assert.assertEquals;
public class MinimumSwapsSpec {
#Test
public void example() {
// setup
int[] unordered = new int[] { 40, 23, 1, 7, 52, 31 };
// run
int minSwaps = MinSwaps.computate(unordered);
// verify
assertEquals(5, minSwaps);
}
#Test
public void example2() {
// setup
int[] unordered = new int[] { 4, 3, 2, 1 };
// run
int minSwaps = MinSwaps.computate(unordered);
// verify
assertEquals(2, minSwaps);
}
#Test
public void example3() {
// setup
int[] unordered = new int[] {1, 5, 4, 3, 2};
// run
int minSwaps = MinSwaps.computate(unordered);
// verify
assertEquals(2, minSwaps);
}
}

Swift 4.2:
// Returns the swap count obtained by summing (edges - 1) over the cycles of `arr`.
func minimumSwaps(arr: [Int]) -> Int {
// value -> index of that value in the sorted array
let sortedValueIdx = arr.sorted().enumerated()
.reduce(into: [Int: Int](), { $0[$1.element] = $1.offset })
var checked = Array(repeating: false, count: arr.count)
var swaps = 0
for idx in 0 ..< arr.count {
if checked[idx] { continue }
// edges counts the elements on the cycle that starts at idx
var edges = 1
var cursorIdx = idx
while true {
let cursorEl = arr[cursorIdx]
let targetIdx = sortedValueIdx[cursorEl]!
// Back at the start: the cycle is closed
if targetIdx == idx {
break
} else {
cursorIdx = targetIdx
edges += 1
}
checked[targetIdx] = true
}
swaps += edges - 1
}
return swaps
}

Python code
# Selection sort that counts how many exchanges it performs.
A = [4, 3, 2, 1]
count = 0
for i in range(len(A)):
    # Index of the first smallest value in the unsorted tail A[i:].
    min_idx = min(range(i, len(A)), key=A.__getitem__)
    if min_idx != i:
        # Only a real exchange counts as a swap.
        A[i], A[min_idx] = A[min_idx], A[i]
        count += 1
# print() call instead of the Python 2 print statement.
print("Swap required : %d" % count)

In Javascript
If the count of the array starts with 1
function minimumSwaps(arr) {
var len = arr.length
var visitedarr = []
var i, start, j, swap = 0
for (i = 0; i < len; i++) {
if (!visitedarr[i]) {
start = j = i
var cycleNode = 1
while (arr[j] != start + 1) {
j = arr[j] - 1
visitedarr[j] = true
cycleNode++
}
swap += cycleNode - 1
}
}
return swap
}
else for input starting with 0
function minimumSwaps(arr) {
var len = arr.length
var visitedarr = []
var i, start, j, swap = 0
for (i = 0; i < len; i++) {
if (!visitedarr[i]) {
start = j = i
var cycleNode = 1
while (arr[j] != start) {
j = arr[j]
visitedarr[j] = true
cycleNode++
}
swap += cycleNode - 1
}
}
return swap
}
Just extending Darshan Puttaswamy code for current HackerEarth inputs

Here's a solution in Java for what @Archibald has already explained.
// Sorts arr, then counts the swaps needed to turn the sorted array back
// into the original order kept in arrCopy.
static int minimumSwaps(int[] arr){
int swaps = 0;
int[] arrCopy = arr.clone();
// value -> index of that value in the original array
HashMap<Integer, Integer> originalPositionMap
= new HashMap<>();
for(int i = 0 ; i < arr.length ; i++){
originalPositionMap.put(arr[i], i);
}
Arrays.sort(arr);
for(int i = 0 ; i < arr.length ; i++){
// Keep sending the value at i to its original index until slot i matches the original.
while(arr[i] != arrCopy[i]){
//swap
int temp = arr[i];
arr[i] = arr[originalPositionMap.get(temp)];
arr[originalPositionMap.get(temp)] = temp;
swaps += 1;
}
}
return swaps;
}

# Sorts `arr` in place by repeatedly moving an out-of-place number to the
# position its value points to (value v goes to index v - 1), and returns
# the number of swaps performed.
def swap_sort(arr)
changes = 0
loop do
# Find a number that is out-of-place
_, i = arr.each_with_index.find { |val, index| val != (index + 1) }
if i != nil
# If such a number is found, then `j` is the position that the out-of-place number points to.
j = arr[i] - 1
# Swap the out-of-place number with number from position `j`.
arr[i], arr[j] = arr[j], arr[i]
# Increase swap counter.
changes += 1
else
# If there are no out-of-place number, it means the array is sorted, and we're done.
return changes
end
end
end

Apple Swift version 5.2.4
// Returns the swap count obtained by summing (cycle size - 1) over the cycles
// of the "sorted slot -> original index" mapping of `arr`.
func minimumSwaps(arr: [Int]) -> Int {
var swapCount = 0
// (original index, value) pairs, sorted by value further down
var arrayPositionValue = [(Int, Int)]()
var visitedDictionary = [Int: Bool]()
for (index, number) in arr.enumerated() {
arrayPositionValue.append((index, number))
visitedDictionary[index] = false
}
arrayPositionValue = arrayPositionValue.sorted{ $0.1 < $1.1 }
for i in 0..<arr.count {
var cycleSize = 0
var visitedIndex = i
// Walk the cycle starting at i until an already-visited index is reached
while !visitedDictionary[visitedIndex]! {
visitedDictionary[visitedIndex] = true
visitedIndex = arrayPositionValue[visitedIndex].0
cycleSize += 1
}
// cycleSize is 0 when i was already covered by an earlier cycle
if cycleSize > 0 {
swapCount += cycleSize - 1
}
}
return swapCount
}

Go version 1.17:
// minimumSwaps returns the minimum number of swaps needed to sort arr, which
// must be a permutation of 1..len(arr).
//
// Value v belongs at index v-1, so following index -> arr[index]-1 walks a
// cycle of the permutation. A cycle of k elements needs k-1 swaps. The slice
// is only read, never modified.
//
// The previous implementation counted the exchanges of an ad-hoc sort, which
// is not the minimum (it reported swaps even for an already sorted slice).
func minimumSwaps(arr []int32) int32 {
	visited := make([]bool, len(arr))
	var swaps int32
	for start := range arr {
		if visited[start] {
			continue
		}
		// Count the elements on the cycle that begins at start.
		var cycle int32
		for j := start; !visited[j]; j = int(arr[j]) - 1 {
			visited[j] = true
			cycle++
		}
		swaps += cycle - 1
	}
	return swaps
}

Error: TypeError: cannot perform reduce with flexible type

I am using Python 3.7. Below is the code, in which I perform an operation along the rows: I want the mean of the data along the rows, but I get an error. I am new to NumPy and Python. I am reading the data from a text file.
My code is:
import numpy as np
def getIndexFromDatetime(date_from, date_to):
    """Convert two ``[day, hour]`` pairs into a ``[start, end]`` index pair.

    Each day contributes 48 index steps and each hour 2, so ``[2, 10]``
    (10 o'clock on day 2) maps to ``(2 - 1) * 48 + 10 * 2 = 68``.

    Args:
        date_from: ``[day, hour]`` of the first sample; ``day`` is 1-based.
        date_to: ``[day, hour]`` of the end of the range.

    Returns:
        ``[start, end]`` as a two-element list.

    An hour above 24 only prints ``error``; the indices are still computed
    and returned.
    """
    if date_from[1] > 24 or date_to[1] > 24:
        print('error')
    start = (date_from[0] - 1) * 48 + date_from[1] * 2
    end = (date_to[0] - 1) * 48 + date_to[1] * 2
    return [start, end]
def is_num(s):
    """Return True when ``float(s)`` would succeed with a finite value.

    The earlier character-stripping test accepted strings such as ``"1-2"``,
    ``"1.2.3"`` or ``"2019-03-01"`` that ``float()`` then rejects, and
    rejected valid fields with surrounding whitespace such as ``"7\\n"``.
    Asking ``float()`` directly keeps this predicate and the conversion
    that follows it in agreement.
    """
    import math

    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    # "nan" and "inf" parse as floats but are not usable measurements.
    return math.isfinite(value)
# Reads a tab-separated text file whose first two lines are header rows and
# returns (DataWeather, header): a dict mapping each combined header name to
# the list of values read for it, plus the list of combined header names.
def get_dataset(fpath):
with open(fpath, 'r') as f:
cnt = 0
DataWeather = {}
header = []
dtime = []
temp1 = []
temp2 = []
for line in f:
terms = line.split('\t')
#print(terms)
# The first two lines are kept as the two header rows.
if cnt == 0: header1 = terms
if cnt == 1: header2 = terms
#header.append(terms[3])
cnt += 1
# After the second line: build the combined header names and an empty list for each.
if cnt == 2:
for i in range(len(header1)):
header.append(header1[i]+header2[i])
#print(header)
for i in range(len(terms)):
DataWeather[header[i]] = []
#break
# Data lines: fields accepted by is_num are stored as float, all others as the raw string.
if cnt > 2:
for i in range(len(terms)):
if is_num(terms[i]):
DataWeather[header[i]].append(float(terms[i]))
else:
DataWeather[header[i]].append(terms[i])
# dtime joins the first two fields of each row; it is not returned.
for i in range(len(DataWeather[header[0]])):
dtime.append(DataWeather[header[0]][i]+' '+DataWeather[header[1]][i])
return DataWeather, header
def get_data(dataset, header, idx):
    """Return the slice ``dataset[header][idx[0]:idx[1]]``.

    Args:
        dataset: mapping from column name to a sliceable sequence of values.
        header: name of the column to read.
        idx: ``[start, end]`` pair; ``end`` is exclusive.
    """
    start, end = idx[0], idx[1]
    return dataset[header][start:end]
# Driver: for each day 1..9 of the chosen month, print the 09:00-18:00 slice
# of several columns and its mean.
data_dir = 'weather_data'
month = 3
day = list(range(1,10))
# Positions in the header list of the columns to analyse.
header_idx = [2,3,4,5,7,16]
for d in day:
print(d)
dtime_from = [d, 9]
dtime_to = [d, 18]
dtime_idx = getIndexFromDatetime(dtime_from, dtime_to)
fpath = '{0}/2019-{1:02}.txt'.format(data_dir, month)
# The file is read again on every iteration of the day loop.
dataset, header = get_dataset(fpath)
for h in header_idx:
print(fpath)
print(header[h], dtime_from, dtime_to, dtime_idx)
data = get_data(dataset, header[h], dtime_idx)
#data= list(map(float,np.array(data)))
#data = map(np.array(data, dtype=np.float))
print(data)
# NOTE(review): get_dataset stores non-numeric fields as strings, so data may
# mix floats and strings here - check before calling np.mean.
print(np.mean(data))
i am getting the following error:
ret = umr_sum(arr, axis, dtype, out, keepdims)
TypeError: cannot perform reduce with flexible type
I also tried functions like `map` and `list`, as shown in the commented-out lines of the code, but it still gives an error about converting a string to float.

Memory issue when calling a C++ function (via Rcpp) inside a for loop in R

Something is unclear to me about using a for loop with an Rcpp function. Here is a simple example that should help:
This is my cpp code in file test_cpp.cpp
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
arma::mat test_Cpp(int n,
arma::vec my_vec,
Rcpp::List my_list,
int mat_size,
double lambda,
double beta) {
// Accumulates lambda * my_vec[it] * beta * my_list[it] over the first n
// entries, skipping entries whose my_vec value is 0, and returns the sum.
// Matrix of mat_size rows & mat_size columns (filled with 0)
// NOTE(review): the constructor below is given no fill type; confirm that the
// memory really is zero-initialised, as the comment above claims.
arma::mat matrix_out(mat_size, mat_size) ;
for (int it = 0 ; it < n ; ++it) {
// Each list element is converted to a matrix before use.
arma::mat temp_mat_flux_convol = my_list[it] ;
if (my_vec[it] != 0) {
matrix_out += lambda * my_vec[it] * beta * temp_mat_flux_convol ;
}
}
return matrix_out ;
}
Then, in the R code, why are res1 and res2 different when the two calls are made inside a 'useless' for loop, but identical without the for loop? I guess there is some segfault-like memory problem, but I did not get it!
library(Rcpp)
library(RcppArmadillo)
sourceCpp(file = "src/test_cpp.cpp")
set.seed(123)
ls_rand = lapply(1:10, function(x) matrix(rnorm(9), ncol=3))
for(i in 1:1){
res1 <- test_Cpp(n = 10,
my_vec = 1:100,
my_list = ls_rand,
mat_size = 3,
lambda = 24,
beta = 0.4)
res2 <- test_Cpp(n = 10,
my_vec = 1:100,
my_list = ls_rand,
mat_size = 3,
lambda = 24,
beta = 0.4)
}
all.equal(res1, res2)
res1 ; res2 # here res2 is twice res1 !!!
## Without for loop
res1 <- test_Cpp(n = 10,
my_vec = 1:100,
my_list = ls_rand,
mat_size = 3,
lambda = 24,
beta = 0.4)
res2 <- test_Cpp(n = 10,
my_vec = 1:100,
my_list = ls_rand,
mat_size = 3,
lambda = 24,
beta = 0.4)
all.equal(res1, res2)
res1 ; res2 # here res1 and res2 are the same!

The error lies here:
// Matrix of mat_size rows & mat_size columns (filled with 0)
arma::mat matrix_out(mat_size, mat_size) ;
The documentation says:
mat(n_rows, n_cols) (memory is not initialised)
mat(n_rows, n_cols, fill_type) (memory is initialised)
So if you change your code to
// Matrix of mat_size rows & mat_size columns (filled with 0)
arma::mat matrix_out(mat_size, mat_size, arma::fill::zeros) ;
The comment is actually right and the problem goes away.

glUseProgram gives Invalid Operation error when trying to use a Compute Shader

I'm trying to use OpenGL in Python to play with Compute Shaders, but I have very little idea what I'm doing. I'd like to be able to send in some data, have the shader alter it, and read that data back out. This is the code I've written so far:
import OpenGL
from OpenGL.GL import *
from OpenGL.GL import shaders
from OpenGL.GLU import *
import pygame
from pygame.locals import *
import numpy as np
buffer_data = np.array([0, 0, 0, 0])
compute_shader_code = """
#version 430 core
layout(std430, binding=9) buffer data{
vec4 Data[];
};
void main()
{
Data[0].x = 1;
Data[0].y = 2;
Data[0].z = 3;
Data[0].w = 4;
}
"""
# Opens a 1000x800 pygame window with the DOUBLEBUF|OPENGL flags, runs test()
# and then shuts pygame down and exits.
def setup():
pygame.init()
window_width = 1000
window_height = 800
display = (window_width, window_height)
# NOTE(review): no OpenGL context version is requested before this call; the
# shader source declares "#version 430 core" - confirm the context supports it.
pygame.display.set_mode(display, DOUBLEBUF|OPENGL)
test()
pygame.quit()
quit()
def test():
    """Compile the compute shader, link it, bind a buffer and dispatch it.

    Compiles ``compute_shader_code`` as a ``GL_COMPUTE_SHADER``, attaches it
    to a new program and links it. It then makes the program current, binds
    a storage buffer to binding point 9, dispatches ``(1, 1, 1)`` work
    groups and looks up the resource index of the storage block ``"data"``.

    Raises:
        RuntimeError: if shader compilation fails; the message is the
            shader info log.

    NOTE(review): the statements are unchanged from the listing. Only the
    indentation has been reconstructed, because the original formatting was
    lost. The nesting of the final ``glUnmapBuffer`` call is a best guess.
    """
    print("Creating shader program")
    compute_shader_program = -1
    shader_program = -1
    compute_shader_program = glCreateShader(GL_COMPUTE_SHADER)
    glShaderSource(compute_shader_program, compute_shader_code)
    glCompileShader(compute_shader_program)
    if glGetShaderiv(compute_shader_program, GL_COMPILE_STATUS) != GL_TRUE:
        raise RuntimeError(glGetShaderInfoLog(compute_shader_program))
    shader_program = glCreateProgram()
    glAttachShader(shader_program, compute_shader_program)
    glLinkProgram(shader_program)
    # NOTE(review): this only compares the handle with the -1 sentinel set
    # above; nothing here queries whether the link step succeeded.
    if shader_program != -1:
        ssbo = -1
        print("Binding buffers")
        glUseProgram(shader_program)
        # NOTE(review): ssbo is still the -1 placeholder when passed here and
        # is never reassigned from a return value -- confirm this is intended.
        glGenBuffers(1, ssbo)
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo)
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo)
        print("Calling compute")
        glDispatchCompute(1, 1, 1)
        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)
        block_index = glGetProgramResourceIndex(shader_program, GL_SHADER_STORAGE_BLOCK, "data", 9)
        if block_index != GL_INVALID_INDEX:
            print("I think I found the data")
            # How do I access the data here
            glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)
setup()
When I run this program I get this error:
File "some location/Compute Shader Test.py", line 56, in test
glUseProgram(shader_program)
File "some location\Anaconda3\envs\Regression\lib\site-packages\OpenGL\platform\baseplatform.py", line 402, in __call__
return self( *args, **named )
File "errorchecker.pyx", line 53, in OpenGL_accelerate.errorchecker._ErrorChecker.glCheckError (src\errorchecker.c:1218)
OpenGL.error.GLError: GLError(
err = 1282,
description = b'invalid operation',
baseOperation = glUseProgram,
cArguments = (2,)
)
So I guess I have 2 questions. First, why am I getting this error? The shader program seems to compile just fine and on my computer it is given an integer value of 2. Am I creating it incorrectly? Is this not how you use compute shaders?
Second, how can I read data back out after the shader has been run?
Thanks a lot!
***************Update***************
After some suggestions from the comments I have it running without crashing. Now the problem is the value that comes back from the card is strange.
Updated code:
import OpenGL
from OpenGL.GL import *
from OpenGL.GL import shaders
from OpenGL.GLU import *
import pygame
from pygame.locals import *
import numpy as np
# Host-side array of four zeros. test() uploads it to the storage buffer
# after converting it to float32, and prints it after the dispatch.
buffer_data = np.array([0, 0, 0, 0])
# GLSL compute shader source. Compared with the first listing it adds a
# work-group size declaration (local_size_x = 1, local_size_y = 1). The
# storage block `data` at binding 9 holds an array of vec4 (`Data`), and
# main() writes 1, 2, 3, 4 into the components of Data[0].
compute_shader_code = """
#version 430 core
layout(local_size_x = 1, local_size_y = 1) in;
layout(std430, binding=9) buffer data{
vec4 Data[];
};
void main()
{
Data[0].x = 1;
Data[0].y = 2;
Data[0].z = 3;
Data[0].w = 4;
}
"""
def setup():
    """Create an OpenGL-capable pygame window, run ``test()``, then exit.

    Initialises pygame, opens a 1000x800 display with the ``DOUBLEBUF`` and
    ``OPENGL`` flags, prints the ``GL_VERSION`` string, calls ``test()``,
    and finally shuts pygame down and terminates the interpreter via
    ``quit()``.
    """
    pygame.init()
    window_width = 1000
    window_height = 800
    display = (window_width, window_height)
    # The OPENGL flag asks pygame for a GL context on this window; the
    # version query and test() both run after this point.
    pygame.display.set_mode(display, DOUBLEBUF|OPENGL)
    print(glGetString(GL_VERSION))
    test()
    pygame.quit()
    quit()
def test():
    """Compile and run the compute shader, then map the buffer and print it.

    Compiles ``compute_shader_code`` as a ``GL_COMPUTE_SHADER``, links it
    into a program and prints the program info log. It then binds a storage
    buffer to binding point 9, uploads ``buffer_data`` as float32 and
    dispatches ``(1, 1, 1)`` work groups. If the storage block ``"data"`` is
    found, it maps the buffer read-only and prints both the value returned
    by ``glMapBuffer`` and the host-side ``buffer_data``.

    Raises:
        RuntimeError: if shader compilation fails; the message is the
            shader info log.

    NOTE(review): the statements are unchanged from the listing. Only the
    indentation has been reconstructed, because the original formatting was
    lost. The nesting of the map/print/unmap lines is a best guess.
    """
    print("Creating shader program")
    compute_shader_program = -1
    shader_program = -1
    compute_shader_program = glCreateShader(GL_COMPUTE_SHADER)
    glShaderSource(compute_shader_program, compute_shader_code)
    glCompileShader(compute_shader_program)
    if glGetShaderiv(compute_shader_program, GL_COMPILE_STATUS) != GL_TRUE:
        raise RuntimeError(glGetShaderInfoLog(compute_shader_program))
    shader_program = glCreateProgram()
    glAttachShader(shader_program, compute_shader_program)
    glLinkProgram(shader_program)
    print(glGetProgramInfoLog(shader_program))
    # NOTE(review): this only compares the handle with the -1 sentinel set
    # above; nothing here queries whether the link step succeeded.
    if shader_program != -1:
        ssbo = -1
        print("Binding buffers")
        glUseProgram(shader_program)
        # NOTE(review): ssbo is still the -1 placeholder when passed here and
        # is never reassigned from a return value -- confirm this is intended.
        glGenBuffers(1, ssbo)
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo)
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo)
        # Upload the host array as contiguous float32 data.
        glBufferData(GL_SHADER_STORAGE_BUFFER, np.ascontiguousarray(buffer_data, dtype=np.float32), GL_DYNAMIC_READ)
        print("Calling compute")
        glDispatchCompute(1, 1, 1)
        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)
        block_index = glGetProgramResourceIndex(shader_program, GL_SHADER_STORAGE_BLOCK, "data", 9)
        if block_index != GL_INVALID_INDEX:
            print("I think I found the data")
            # How do I access the data here
            # `result` is printed as returned by glMapBuffer; buffer_data is
            # the host array, which nothing in this function writes to.
            result = glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY)
            print(result)
            print(buffer_data)
            glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)
setup()
When I print result I get 2109824291840 which is obviously not what I was expecting. I would appreciate any help on how to properly read data back from the buffer. Thanks again!

PyOpenCL how to modify a matrix locally within the kernel function

I am trying to modify a matrix (Pbis) locally within a pyOpenCL kernel function and when filling up this matrix with 0 it alters the result matrix R. When executing this code we obtain weird values in the R matrix. It is probably due to memory allocation but we cannot figure out how to fix it. Normally R should be exclusively composed of the init value.
# Build an OpenCL program containing the kernel `generate_paths`.
# Each work-item reads i = get_global_id(0) and writes `init` into
# R[i*(n+1) + j] for j = 0..n. P and N are accepted but not used in the body.
# NOTE(review): `Pbis` is declared as `__private float*` with no initializer
# and is then written through (Pbis[k] = 0); no storage is assigned to it
# anywhere in the kernel.
program = cl.Program(context, """
__kernel void generate_paths(__global float *P, ushort const n,
ushort N, ushort init, __global float *R){
int i = get_global_id(0);
__private float* Pbis;
for (int k=0; k<n; k++){
Pbis[k] = 0;
}
for (int j=0; j<n; j++)
{
R[i*(n+1) + j] = init;
}
R[i*(n+1) + n] = init;
}
""").build()
The parameters for the generation are:
program.generate_paths(queue, res_np.shape, None, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
Here is the entire code for reproducibility:
import numpy as np
import pyopencl as cl
import numpy.linalg as la
import os
# Full reproducer: builds the inputs, compiles the `generate_paths` kernel,
# runs it, copies the result back to the host and prints it.
# Both PyOpenCL-related environment variables are set to the string '1'.
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
os.environ['PYOPENCL_CTX'] = '1'
(n, N) = (3,6)
# (n+1)*N uniform samples on [0, 1), stored as float32.
U = np.random.uniform(0,1, size=(n+1)*N)
U = U.astype(np.float32)
# 4x4 matrix with 0 on the diagonal and 1/3 elsewhere, stored as float32.
P = np.matrix([[0, 1/3, 1/3, 1/3], [1/3, 0, 1/3, 1/3], [1/3, 1/3, 0, 1/3], [1/3, 1/3, 1/3, 0]])
P = P.astype(np.float32)
# Host array of shape (N, n+1); it supplies the result buffer size and the
# launch's global size below.
res_np = np.zeros((N, n+1),dtype = np.float32)
# Use the first device of the first platform.
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)
mf = cl.mem_flags
# NOTE(review): the same flag is OR-ed with itself in both calls below;
# presumably an access flag combined with COPY_HOST_PTR was intended -- confirm.
U_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=U)
P_buf = cl.Buffer(context, mf.COPY_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=P)
# Output buffer sized to hold res_np.
res_buf = cl.Buffer(context, mf.WRITE_ONLY, res_np.nbytes)
init = 0
# Kernel: each work-item reads i = get_global_id(0), writes `current`
# (a copy of init) into R[i*(n+1) + j] for j < n, and `init` into
# R[i*(n+1) + n]. U, P and N are accepted but not used in the body.
# NOTE(review): `Pbis` is declared as `__private float*` with no initializer
# and is then written through (Pbis[k] = 0); no storage is assigned to it
# anywhere in the kernel.
program = cl.Program(context, """
__kernel void generate_paths(__global const float *U, __global float *P, ushort const n,
ushort N, ushort init, __global float *R){
int i = get_global_id(0);
int current = init;
__private float* Pbis;
for (int k=0; k<n; k++){
Pbis[k] = 0;
}
for (int j=0; j<n; j++)
{
R[i*(n+1) + j] = current;
}
R[i*(n+1) + n] = init;
}
""").build()
# Commented-out call to a different kernel (`prg.multiply`), left in place
# from the listing.
#prg.multiply(queue, c.shape, None,
# np.uint16(n), np.uint16(m), np.uint16(p),
# a_buf, b_buf, c_buf)
# a_mul_b = np.empty_like(c)
# cl.enqueue_copy(queue, a_mul_b, c_buf)
# NOTE(review): the global size is res_np.shape, i.e. the 2-D tuple (N, n+1),
# while the kernel indexes only with get_global_id(0) -- confirm a 2-D
# launch is intended.
program.generate_paths(queue, res_np.shape, None, U_buf, P_buf, np.uint16(n), np.uint16(N), np.uint16(init), res_buf)
# Copy the device result into a host array with res_np's shape and dtype.
chem_gen = np.empty_like(res_np)
cl.enqueue_copy(queue, chem_gen, res_buf)
print("Platform Selected = %s" %platform.name)
print("Device Selected = %s" %device.name)
print("Generated Paths:")
print (chem_gen)

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Parallel QuickSort, can someone help me? - python-3.x

Related

Calculating number of minimum swaps to sort array (selection sort is too slow) [duplicate]

Error: TypeError: cannot perform reduce with flexible type

Memory issue by using a for loop in R of C++ function using Rcpp

glUseProgram gives Invalid Operation error when trying to use a Compute Shader

PyOpenCL how to modify a matrix locally within the kernel function

Categories

Resources