Python way - processing multiple files and adding up all the results in parallel - python-3.x

I have a list of 500 JSON files. The contents of each file look like this:
{'minute': '2022-11-16T02:29:00.000+00:00', 'mycount': [[0, 0], [1, 32], [2, 3456], [3, 446], [4, 534534], [5, 474], [6, 448], [7, 529], [8, 507], [9, 515], [10, 477], [11, 486], [12, 491], [13, 474], [14, 528], [15, 23]]}
I want to achieve the following using parallel processing (maybe processing 100 files in parallel):
For each file, find the sum of the second element of each item in mycount (0 + 32 + 3456 + 446 + 534534 + ...). Let's call it sum1.
Calculate this per-file sum for every file and return the total sum = sum1 + sum2 + sum3 + ...
How can I achieve this using multithreading in Python?

If you don't mind using multiprocessing instead of multithreading, you can use the multiprocessing library together with the json decoder to parse the content of your files:
import multiprocessing as mp
import json
# Other libraries
import os
import warnings


def compute_file_sum(f):
    """Compute the sum for a file"""
    try:
        # Read the whole content
        with open(f, 'r') as ff:
            file_content = ff.readlines()
        # Load as JSON (mind the change of ' into ")
        file_content = json.loads('\n'.join(file_content).replace("'", '"'))
        # Compute the sum of the second items of each element in 'mycount'
        return sum(
            c[1] for c in file_content['mycount']
        )
    except Exception as e:
        # Handle exceptions
        warnings.warn(f"Issues with file {f}, {str(e)}")
        return 0


def get_filepaths(root_dir):
    """Get an iterator with the paths of the files of interest"""
    return map(
        lambda y: os.path.join(root_dir, y),
        filter(
            # Keep only files whose names match some conditions
            lambda x: os.path.splitext(x)[0].startswith('bbb') and os.path.splitext(x)[1] == '.txt',
            next(os.walk(root_dir))[2]
        )
    )


if __name__ == '__main__':
    # Get the path of the folder with the files of interest
    # (here, the folder containing the Python script)
    root_dir = os.path.dirname(__file__)
    # Compute the per-file sums in parallel
    with mp.Pool(processes=mp.cpu_count() - 1) as a:
        file_sums = a.imap_unordered(compute_file_sum, get_filepaths(root_dir))
        # Get the total sum
        total_sum = sum(file_sums)
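Since the question explicitly asks about multithreading, here is a minimal alternative sketch using concurrent.futures; it reuses the compute_file_sum function above and assumes a hypothetical json_files list holding the 500 file paths. Reading the files is I/O-bound, but the JSON parsing is CPU-bound, so a ProcessPoolExecutor will usually scale better than threads under the GIL.

from concurrent.futures import ThreadPoolExecutor  # or ProcessPoolExecutor

def total_over_files(json_files, max_workers=100):
    """Sum the per-file sums, computing them in a pool of workers."""
    # compute_file_sum is the function defined in the answer above;
    # json_files is a hypothetical list of paths to the 500 JSON files.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return sum(executor.map(compute_file_sum, json_files))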

Related

select a part of dataframe every time in parallel

I want to create dictionaries in a loop.
Since in every iteration I take only a part of the initial dataframe (df_train = df[df['CLASS'] == oneClass]), I want to run the iterations in parallel.
My code is:
import pandas as pd
import numpy as np
from multiprocessing import Pool

df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5], 'c': [6, 7, 8], 'CLASS': ['A', 'B', 'C']})

def make_dataframes(df, oneClass):
    new_df = {}
    df_train = df[df['CLASS'] == oneClass]
    numeric_only_data_cols = df_train.select_dtypes(include=np.number).columns.difference(['CLASS'])
    numeric_only_data = df_train[numeric_only_data_cols]
    X = numeric_only_data.values
    x = X * 100
    orig_columns = numeric_only_data.loc[:,
        numeric_only_data.columns != 'CLASS'].columns
    new_df[oneClass] = pd.DataFrame(x, columns=orig_columns)
    new_df[oneClass]['CLASS'] = df_train['CLASS']
    return new_df

new_df = {}
classes = np.unique(df['CLASS'])
with Pool(4) as pool:
    for new_dataframe in pool.map(make_dataframes, classes):
        new_df['new_dataframe'] = new_dataframe
    pool.close()
    pool.join()
I omitted the for loop in the function:
new_df = {}
for oneClass in classes:
    df_train = df[df['GROUP_DESC'] == oneClass]
    ...
Now, I am receiving:
make_dataframes() missing 1 required positional argument: 'oneClass'
I am not sure how to place the arguments of the function and whether classes is a valid argument for map.
Are you planning on executing your code inside a cluster? If not, then you're probably better off executing your code in the old single-process fashion. There's a great talk on the subject by Raymond Hettinger that I find pretty interesting and recommend checking out: Raymond Hettinger, Keynote on Concurrency, PyBay 2017.
Having said that, one easy fix to your implementation would be to define a single parameter as input to make_dataframes that represents a tuple of both df and oneClass:
import pandas as pd
import numpy as np
from multiprocessing import Pool

def make_dataframes(args):
    new_df = {}
    df = args[0]         # <--- Unpacking values
    oneClass = args[-1]  # <--- Unpacking values
    df_train = df[df['CLASS'] == oneClass]
    numeric_only_data = df_train.select_dtypes(include=np.number).loc[:, lambda xdf: xdf.columns.difference(['CLASS'])]
    X = numeric_only_data.values
    x = X * 100
    orig_columns = numeric_only_data.loc[:, numeric_only_data.columns != 'CLASS'].columns
    new_df[oneClass] = pd.DataFrame(x, columns=orig_columns)
    new_df[oneClass]['CLASS'] = df_train['CLASS']
    return new_df

df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5], 'c': [6, 7, 8], 'CLASS': ['A', 'B', 'C']})
new_df = {}
classes = np.unique(df["CLASS"])
with Pool(4) as pool:
    for new_dataframe in pool.map(make_dataframes, zip([df]*len(classes), classes)):
        new_df[list(new_dataframe.keys())[0]] = list(new_dataframe.values())[0]
    pool.close()
    pool.join()
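As a side note (my own addition, not part of the original answer), another common way to pass the fixed df argument without changing the two-argument signature from the question is functools.partial:

from functools import partial
from multiprocessing import Pool

# Bind df up front so pool.map only iterates over the classes;
# make_dataframes here is the original two-argument version from the question.
with Pool(4) as pool:
    results = pool.map(partial(make_dataframes, df), classes)
new_df = {k: v for d in results for k, v in d.items()}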
A second approach would be to use the Joblib package instead of multiprocessing, like so:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def make_dataframes(df, oneClass):
    new_df = {}
    df_train = df[df["CLASS"] == oneClass]
    numeric_only_data = df_train.select_dtypes(include=np.number).loc[
        :, lambda xdf: xdf.columns.difference(["CLASS"])
    ]
    X = numeric_only_data.values
    x = X * 100
    orig_columns = numeric_only_data.loc[
        :, numeric_only_data.columns != "CLASS"
    ].columns
    new_df[oneClass] = pd.DataFrame(x, columns=orig_columns)
    new_df[oneClass]["CLASS"] = df_train["CLASS"]
    return new_df

df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5], 'c': [6, 7, 8], 'CLASS': ['A', 'B', 'C']})
classes = np.unique(df["CLASS"])
new_df = {
    key: value
    for parallel in Parallel(n_jobs=4)(
        delayed(make_dataframes)(df, i) for i in classes
    )
    for key, value in parallel.items()
}
Finally, here is the approach I recommend if you're not planning on running this code inside a power-hungry cluster and need to extract all the juice you can get from your machine:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def make_dataframes(df, oneClass):
    new_df = {}
    df_train = df[df["CLASS"] == oneClass]
    numeric_only_data = df_train.select_dtypes(include=np.number).loc[
        :, lambda xdf: xdf.columns.difference(["CLASS"])
    ]
    X = numeric_only_data.values
    x = X * 100
    orig_columns = numeric_only_data.loc[
        :, numeric_only_data.columns != "CLASS"
    ].columns
    new_df[oneClass] = pd.DataFrame(x, columns=orig_columns)
    new_df[oneClass]["CLASS"] = df_train["CLASS"]
    return new_df

df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5], 'c': [6, 7, 8], 'CLASS': ['A', 'B', 'C']})
classes = np.unique(df["CLASS"])
new_df = {c: make_dataframes(df, c)[c] for c in classes}
For comparison, I've recorded each approach execution time:
multiprocessing: CPU times: user 13.6 ms, sys: 41.1 ms, total: 54.7 ms Wall time: 158 ms
joblib: CPU times: user 14.3 ms, sys: 0 ns, total: 14.3 ms Wall time: 16.5 ms
Serial processing: CPU times: user 14.1 ms, sys: 797 µs, total: 14.9 ms Wall time: 14.9 ms
Running things in parallel has a lot of overhead communication costs between the different processing nodes. Besides, it's an intrinsically more complex task than running things serially. Consequently, developing and maintaining the code becomes exponentially harder and more expensive. If running things in parallel is the number one priority, I would recommend first ditching Pandas and using PySpark or Dask instead.
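If Dask is an option, a minimal sketch of the same per-class workload with dask.delayed could look like this; it reuses the make_dataframes function defined above and is only an illustration, not a tuned implementation:

from dask import delayed, compute

# One lazy task per class; compute() runs them with Dask's scheduler.
tasks = [delayed(make_dataframes)(df, c) for c in classes]
results = compute(*tasks)  # tuple of {class: DataFrame} dicts
new_df = {k: v for d in results for k, v in d.items()}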

Roll of different amount along a single axis in a 3D matrix [duplicate]

I have a matrix (2d numpy ndarray, to be precise):
A = np.array([[4, 0, 0],
              [1, 2, 3],
              [0, 0, 5]])
And I want to roll each row of A independently, according to roll values in another array:
r = np.array([2, 0, -1])
That is, I want to do this:
print(np.array([np.roll(row, x) for row, x in zip(A, r)]))
[[0 0 4]
[1 2 3]
[0 5 0]]
Is there a way to do this efficiently? Perhaps using fancy indexing tricks?
Sure, you can do it using advanced indexing; whether it is the fastest way probably depends on your array size (if your rows are large, it may not be):
rows, column_indices = np.ogrid[:A.shape[0], :A.shape[1]]
# Always use a negative shift, so that column_indices are valid.
# (could also use a modulo operation)
r[r < 0] += A.shape[1]
column_indices = column_indices - r[:, np.newaxis]
result = A[rows, column_indices]
numpy.lib.stride_tricks.as_strided stricks (abbrev pun intended) again!
Speaking of fancy indexing tricks, there's the infamous - np.lib.stride_tricks.as_strided. The idea/trick would be to get a sliced portion starting from the first column until the second last one and concatenate at the end. This ensures that we can stride in the forward direction as needed to leverage np.lib.stride_tricks.as_strided and thus avoid the need of actually rolling back. That's the whole idea!
Now, in terms of actual implementation we would use scikit-image's view_as_windows to elegantly use np.lib.stride_tricks.as_strided under the hood. Thus, the final implementation would be -
from skimage.util.shape import view_as_windows as viewW

def strided_indexing_roll(a, r):
    # Concatenate with sliced version to cover all rolls
    a_ext = np.concatenate((a, a[:, :-1]), axis=1)
    # Get sliding windows; use advanced indexing to select appropriate ones
    n = a.shape[1]
    return viewW(a_ext, (1, n))[np.arange(len(r)), (n-r) % n, 0]
Here's a sample run -
In [327]: A = np.array([[4, 0, 0],
...: [1, 2, 3],
...: [0, 0, 5]])
In [328]: r = np.array([2, 0, -1])
In [329]: strided_indexing_roll(A, r)
Out[329]:
array([[0, 0, 4],
[1, 2, 3],
[0, 5, 0]])
Benchmarking
# @seberg's solution
def advindexing_roll(A, r):
    rows, column_indices = np.ogrid[:A.shape[0], :A.shape[1]]
    r[r < 0] += A.shape[1]
    column_indices = column_indices - r[:, np.newaxis]
    return A[rows, column_indices]
Let's do some benchmarking on an array with a large number of rows and columns -
In [324]: np.random.seed(0)
...: a = np.random.rand(10000,1000)
...: r = np.random.randint(-1000,1000,(10000))
# @seberg's solution
In [325]: %timeit advindexing_roll(a, r)
10 loops, best of 3: 71.3 ms per loop
# Solution from this post
In [326]: %timeit strided_indexing_roll(a, r)
10 loops, best of 3: 44 ms per loop
In case you want a more general solution (dealing with any shape and any axis), I modified @seberg's solution:
def indep_roll(arr, shifts, axis=1):
    """Apply an independent roll for each dimension of a single axis.

    Parameters
    ----------
    arr : np.ndarray
        Array of any shape.
    shifts : np.ndarray
        How many shifts to apply for each dimension. Shape: `(arr.shape[axis],)`.
    axis : int
        Axis along which elements are shifted.
    """
    arr = np.swapaxes(arr, axis, -1)
    all_idcs = np.ogrid[[slice(0, n) for n in arr.shape]]
    # Convert to a positive shift
    shifts[shifts < 0] += arr.shape[-1]
    all_idcs[-1] = all_idcs[-1] - shifts[:, np.newaxis]
    result = arr[tuple(all_idcs)]
    arr = np.swapaxes(result, -1, axis)
    return arr
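For instance, a quick sanity check on the 3x3 example from the question (my own illustration, not part of the original answer); note that indep_roll modifies the shifts array in place, hence the copy:

A = np.array([[4, 0, 0],
              [1, 2, 3],
              [0, 0, 5]])
r = np.array([2, 0, -1])
print(indep_roll(A, r.copy(), axis=1))
# [[0 0 4]
#  [1 2 3]
#  [0 5 0]]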
I implemented a pure numpy.lib.stride_tricks.as_strided solution as follows:
from numpy.lib.stride_tricks import as_strided

def custom_roll(arr, r_tup):
    m = np.asarray(r_tup)
    arr_roll = arr[:, [*range(arr.shape[1]), *range(arr.shape[1]-1)]].copy()  # need `copy`
    strd_0, strd_1 = arr_roll.strides
    n = arr.shape[1]
    result = as_strided(arr_roll, (*arr.shape, n), (strd_0, strd_1, strd_1))
    return result[np.arange(arr.shape[0]), (n-m) % n]
A = np.array([[4, 0, 0],
              [1, 2, 3],
              [0, 0, 5]])
r = np.array([2, 0, -1])
out = custom_roll(A, r)
# array([[0, 0, 4],
#        [1, 2, 3],
#        [0, 5, 0]])
By using a fast Fourier transform we can apply a transformation in the frequency domain and then use the inverse fast Fourier transform to obtain the row shift.
So this is a pure numpy solution that takes only one line:
import numpy as np
from numpy.fft import fft, ifft

# The row-shift function using the fast Fourier transform
# rshift(A, r) where A is a 2D array, r the row-shift vector
def rshift(A, r):
    return np.real(ifft(fft(A, axis=1) * np.exp(2 * 1j * np.pi / A.shape[1] * r[:, None] * np.r_[0:A.shape[1]][None, :]), axis=1).round())
This will apply a left shift, but we can simply negate the exponent of the exponential to turn the function into a right-shift function:
ifft(fft(...)*np.exp(-2*1j...)
It can be used like that:
# Example:
A = np.array([[1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4]])
r = np.array([1, -1, 3])
print(rshift(A, r))
Building on Divakar's excellent answer, you can apply this logic to a 3D array easily (which was the problem that brought me here in the first place). Here's an example - basically flatten your data, roll it, and reshape it afterwards:
from skimage.util.shape import view_as_windows as viewW

def applyroll_30(cube, threshold=25, offset=500):
    flattened_cube = cube.copy().reshape(cube.shape[0] * cube.shape[1], cube.shape[2])
    roll_matrix = calc_roll_matrix_flattened(flattened_cube, threshold, offset)
    rolled_cube = strided_indexing_roll(flattened_cube, roll_matrix, cube_shape=cube.shape)
    rolled_cube = rolled_cube.reshape(cube.shape[0], cube.shape[1], cube.shape[2])
    return rolled_cube

def calc_roll_matrix_flattened(cube_flattened, threshold, offset):
    """Calculates the number of positions along the time axis we need to shift
    elements by in order to trigger the data.
    Returns a 1D numpy array of shape (X*Y,).
    """
    # argmax(...) finds the position in the cube (3d) where we are above threshold
    roll_matrix = np.argmax(cube_flattened > threshold, axis=1) + offset
    # ensure we don't have an index out of bounds
    roll_matrix[roll_matrix > cube_flattened.shape[1]] = cube_flattened.shape[1]
    return roll_matrix

def strided_indexing_roll(cube_flattened, roll_matrix_flattened, cube_shape):
    # Negate the shifts, otherwise we shift in the wrong direction for my application
    roll_matrix_flattened = -1 * roll_matrix_flattened
    # Concatenate with a sliced copy to cover all rolls
    a_ext = np.concatenate((cube_flattened, cube_flattened[:, :-1]), axis=1)
    # Get sliding windows; use advanced indexing to select the appropriate ones
    n = cube_flattened.shape[1]
    result = viewW(a_ext, (1, n))[np.arange(len(roll_matrix_flattened)), (n - roll_matrix_flattened) % n, 0]
    result = result.reshape(cube_shape)
    return result
Divakar's answer doesn't do justice to how much more efficient this is on a large cube of data. I've timed it on a 400x400x2000 cube of int8 data. An equivalent for loop takes ~5.5 seconds, Seberg's answer ~3.0 seconds, and strided_indexing_roll ~0.5 seconds.

Task : Find unique elements in an array. Count their occurrences. Find the numbers that occur less than 10 times in an array of 5000 elements

I tried a few solutions :
1.)
uniqueValues, indexList, occurCount = np.unique(desired_array,
                                                return_index=True, return_counts=True)
print(uniqueValues, indexList, occurCount)
However, indexList only gives the first occurrence of a number. For example: if the number 33 occurred at indices 20, 56, and 3000, indexList would only show that it occurred at 20. Since 33 occurs less than 10 times (i.e. 3 times), I need all of its locations.
2.) I decided to use a dictionary to find all the index locations. But this is not working.
for i in range(5000):
    if not d.get(i):
        d[desired_array[i]] = [i]
    else:
        indices = d[desired_array[i]]
        indices.append(i)
This job screams for collections.Counter:
from collections import Counter
desired_array = [1, 2, 3, 1, 3, 5, 3]
result = Counter(desired_array)
print(result)
This will print out the unique elements and the count of occurrences:
Counter({3: 3, 1: 2, 2: 1, 5: 1})
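To finish the stated task (the numbers that occur fewer than 10 times), the Counter can then be filtered; a minimal sketch of my own, not part of the original answer:

# Values that appear fewer than 10 times in the array
rare_values = [value for value, count in result.items() if count < 10]
# If the index positions of those values are also needed:
positions = {value: [i for i, x in enumerate(desired_array) if x == value]
             for value in rare_values}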
You can replace
for i in range(1250):
    var = desired_array[i]
    if not d.get(var):
        d[var] = []
        # print(var)
    s = d[var]
    s.append(i)
with
for i in range(1250):
    var = desired_array[i]
    d.setdefault(var, []).append(i)
According to the documentation dict.setdefault(key, default):
If key is in the dictionary, return its value. If not, insert key with a value of default and return default. default defaults to None.
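An equivalent idiom, shown here only as an alternative sketch, is collections.defaultdict, which bakes the default list into the dictionary itself:

from collections import defaultdict

d = defaultdict(list)
for i, var in enumerate(desired_array):
    d[var].append(i)  # every value maps to the list of indices where it occurs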
To write a CSV file it's best to use the standard csv.writer:
import csv

with open('some.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(someiterable)
If you want to write the key/value pairs of your dict to the csv file you need to write something like:
with open('some.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for k, v in desired_array.items():
        writer.writerow((k, v))

Creating different dataframe and outputting it to different csv based on list of indexes

I have a list of index pairs like the one below, built from a value N. Here is the code I used to create the list:
df = pd.DataFrame(np.arange(100).reshape((-1, 5)))
N = 4
ix = [[i, i+N] for i in range(0,len(df),N)]
ix
# [[0, 4], [4, 8], [8, 12], [12, 16], [16, 20]]
I want to create a function which:
1) creates N dataframes (df_1, df_2, df_3, df_4, df_5). The rows in each dataframe are based on each pair of indexes. For example, df_1 will have all the rows between index 0 and 4 of the main dataframe df, and similarly df_2 will have all the rows between index 4 and 8.
2) outputs each dataframe to a csv as df_1.csv, df_2.csv, ...
Below is the code I tried, but the "df_i = df.ix[i]" step only gets the rows at the two listed indices, not the range between them:
def write(df, ix):
    for i in ix:
        try:
            df_i = df.ix[i]
            df_i.to_csv("a.csv", index=False)
        except:
            pass
You can use iloc
import pandas as pd
import numpy as np

def write(df, ix):
    c = 1
    for i in ix:
        try:
            df_i = df.iloc[i[0]:i[1]]  # use iloc
            df_i.to_csv(f"df_{str(c)}.csv", index=False)  # f-string to name the file
            c += 1  # update your counter
        except:
            pass

df = pd.DataFrame(np.arange(100).reshape((-1, 5)))
N = 5
ix = [(i, i+N) for i in range(0, len(df), N)]
write(df, ix)
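As a side note (my own addition, not part of the original answer), if the goal is simply to cut the frame into equal-sized chunks, numpy's array_split can avoid building the index list by hand:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(100).reshape((-1, 5)))
# Split into 4 chunks of 5 rows each and write one csv per chunk
for c, chunk in enumerate(np.array_split(df, 4), start=1):
    chunk.to_csv(f"df_{c}.csv", index=False)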

How to replicate Pandas syntax? (To filter data frames)

How do I implement the syntax Pandas uses for filtering dataframes (df[df.column1 > someValue])?
I am trying to make a class that has the same syntax as Pandas when filtering dataframes.
How do I replicate the syntax for a Dataframe df = DataFrame(someData) like this one:
df[df.column1 > someValue]
I implemented the methods __getattr__ and __getitem__ for the syntaxes of
df.column1
df['column1']
But I don't know how to link both together. Also, I could not find the function to copy from Pandas code.
Either an implementation of this or a reference to the relevant function in the Pandas code would be of great help.
Edit (solution):
Following the hint in the answers, I implemented the __getitem__ function as follows:
from itertools import compress

def __getitem__(self, name):
    """Get items with [ and ]"""
    # If there is no expression, return a column
    if isinstance(name, str):
        return self.data[name]
    # If there was an expression, return the dataframe filtered
    elif isinstance(name, list):
        ind = list(compress(range(len(name)), name))
        temp = DataFrame([[self.data[c].values[i]
                           for i in ind]
                          for c in self.columns],
                         columns=self.columns)
        return temp
Note that I also had to implement the comparison methods for my column class (Series).
The full code can be seen here.
You need to implement __getitem__ to take a list of booleans and only return items when True. You will also need to implement the conditional operators (>, ==, etc.) to return that list of booleans, e.g. (proof of concept code):
class A(object):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, key):
        return A([d for k, d in zip(key, self.data) if k])

    def __gt__(self, value):
        return [d > value for d in self.data]

    def __repr__(self):
        return str(self.__class__) + ' [' + ', '.join(str(d) for d in self.data) + ']'
>>> a = A(list(range(20)))
>>> a
<class '__main__.A'> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
>>> a[a > 5]
<class '__main__.A'> [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
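To connect this with the df.column1 syntax from the question, here is a hypothetical sketch of my own (not the answerer's code) showing one way to link __getattr__ and __getitem__ together: attribute access returns a column object whose comparison operators produce the boolean mask that __getitem__ consumes.

class Column:
    def __init__(self, values):
        self.values = values

    # Comparisons yield a boolean mask that the frame's __getitem__ understands
    def __gt__(self, other):
        return [v > other for v in self.values]


class MiniFrame:
    def __init__(self, data):
        # data: dict of column name -> list of values (hypothetical layout)
        self.data = data

    def __getattr__(self, name):
        return Column(self.data[name])

    def __getitem__(self, key):
        if isinstance(key, str):
            return Column(self.data[key])
        # Boolean mask: keep only rows where the mask is True
        return MiniFrame({c: [v for v, k in zip(vals, key) if k]
                          for c, vals in self.data.items()})


df = MiniFrame({'column1': [1, 5, 10], 'column2': ['a', 'b', 'c']})
filtered = df[df.column1 > 3]
print(filtered.data)  # {'column1': [5, 10], 'column2': ['b', 'c']}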
I think you basically want something that just wraps a recarray or structured array.
import numpy as np

myarray = np.array([("Hello", 2.5, 3),
                    ("World", 3.6, 2),
                    ("Foobar", 2, 7)]).T
df = np.core.records.fromarrays(myarray,
                                names='column1, column2, column3',
                                formats='S8, f8, i8')
print(df)
print(df[df.column3 <= 3])
While I don't use Pandas myself, the DataFrame seems very similar to a recarray. If you want to roll your own, be sure to read about subclassing ndarray. numpy arrays can also be indexed with boolean mask variables, such as:
myarray = np.array([(1, 2.5, 3.),
                    (2, 3.6, 2.),
                    (3, 2, 7.)])
print(myarray[myarray[:, 2] <= 3.])
