Multiprocessing/for loop is skipping elements randomly - python-3.x

The dataset has billions of data points for each pair, so I tried multiprocessing to speed the loop up.
Why is the multiprocessing/for loop skipping some elements from pairs?
Each time I rerun it, different names are skipped at random and then the code ends.
import pandas as pd
import pickle
import time
import concurrent.futures

start = time.perf_counter()

pairs = ['GBPUSD', 'AUDUSD', 'EURUSD', 'EURJPY', 'GBPJPY', 'USDJPY', 'USDCAD', 'EURGBP']

def pickling_joined(p):
    df = pd.read_csv(f'C:\\Users\\Ghosh\\Downloads\\dataset\\data_joined\\{p}.csv')
    df['LTP'] = (df['Bid'] + df['Ask']) / 2
    print(f'\n=====>> Converting Date format for {p} ....')
    df['Date'] = df['Date'].apply(pd.to_datetime)
    print(f'\n=====>> Date format converted for {p} ....')
    df.set_index('Date', inplace=True)
    df = pd.DataFrame(df)
    with open(f'C:\\Users\\Ghosh\\Downloads\\dataset\\data_pickled\\{p}.pkl', 'wb') as pickle_file:
        pickle.dump(df, pickle_file)
    print(f'\n=====>> Pickling done for {p} !!!')

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(pickling_joined, pairs)
    finish = time.perf_counter()
    print(f'Finished in {finish - start} seconds')

Python's threading/multiprocessing doesn't cope well with files this heavy, so I would recommend Dask here. Dask spreads the work across a cluster of workers, which behaves like multiprocessing but takes less time, and you can still use multiprocessing on top of it to run things even faster.
import concurrent.futures
import pickle
import time
import dask.dataframe as dd

start = time.perf_counter()

pairs = ['GBPUSD', 'AUDUSD', 'EURUSD', 'EURJPY', 'GBPJPY', 'USDJPY', 'USDCAD', 'EURGBP']

def pickling_joined(p):
    df = dd.read_csv(f'C:\\Users\\Ghosh\\Downloads\\dataset\\data_joined\\{p}.csv')
    df['LTP'] = (df['Bid'] + df['Ask']) / 2
    print(f'\n=====>> Converting Date format for {p} ....')
    df['Date'] = dd.to_datetime(df.Date)
    print(f'\n=====>> Date format converted for {p} ....')
    # sorted=True tells Dask the Date column is already ordered; if the CSV
    # is not sorted by Date, drop the flag so Dask shuffles the partitions.
    df = df.set_index('Date', sorted=True)
    df = df.compute()  # materialise as a regular pandas DataFrame for pickling
    with open(f'C:\\Users\\Ghosh\\Downloads\\dataset\\data_pickled\\{p}.pkl', 'wb') as pickle_file:
        pickle.dump(df, pickle_file)
    print(f'\n=O=O=O=O=O>> Pickling done for {p} !!!')

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(pickling_joined, pairs)
    finish = time.perf_counter()
    print(f'\nFinished in {finish - start} seconds')

Some would say Java is a better fit for jobs this heavy, but Python does not actually skip steps on large DataFrames: when a worker process raises an exception (for example, from running out of memory on a huge CSV), executor.map swallows the error unless the results are iterated, which makes pairs appear to be skipped at random.
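A minimal sketch of how those hidden worker errors can be surfaced (the same pairs list and pickling_joined function as above are assumed):

import concurrent.futures

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # submit() returns futures; result() re-raises anything a worker threw
        futures = {executor.submit(pickling_joined, p): p for p in pairs}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f'{futures[future]} failed: {e!r}')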

Related

csv_reader read N lines at a time

I have to read a CSV file N lines at a time.
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
    print(row)
I know I can loop N times at a time, build a list of lists, and process it that way.
But is there a simpler way of using csv_reader so that I read N lines at a time?
Hi, I don't think you'll be able to do that without a loop using the csv package.
You should use pandas (pip install --user pandas) instead:
import pandas
df = pandas.read_csv('myfile.csv')
start = 0
step = 2  # your 'N'
for i in range(0, len(df), step):
    print(df[i:i+step])
    start = i
Pandas has a chunksize option on its read_csv() method, and I would probably explore that option.
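For reference, a minimal chunksize sketch (the file name data.csv is an assumption):

import pandas as pd

# chunksize makes read_csv return an iterator of DataFrames,
# each holding at most 5 rows
for chunk in pd.read_csv("data.csv", chunksize=5):
    print(chunk)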
If I was going to do it myself by hand, I would probably do something like:
import csv

def process_batch(rows):
    print(rows)

def get_batch(reader, batch_size):
    # the walrus operator (:=) requires Python 3.8+
    return [row for _ in range(batch_size) if (row := next(reader, None))]

with open("data.csv", "r", newline="") as file_in:
    reader = csv.reader(file_in)
    while batch := get_batch(reader, 5):
        process_batch(batch)

Automatically Extracting the Datetime Format from a Pandas Series [duplicate]

I am trying to normalize the column 'Data' so that all the dates follow one pattern.
The formats I have are:
1/30/20 16:00
1/31/2020 23:59
2020-02-02T23:43:02
Here is the code for the dataframe.
import requests
import pandas as pd
import numpy as np
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
one_df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
one_df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
I tried adding the code below, but it doesn't produce the result I wanted:
pd.to_datetime(one_df['Data'])
one_df.style.format({"Data": lambda t: t.strftime("%m/%d/%Y")})
Any help?
UPDATE
This is the complete code, but it doesn't work; many exceptions are printed, with various date formats.
import requests
import pandas as pd
import numpy as np
from datetime import datetime
url = "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"
csv_only = [i.split("=")[1][1:-1] for i in requests.get(url).text.split(" ") if '.csv' in i and 'title' in i]
combo = [pd.read_csv(url.replace("github","raw.githubusercontent").replace("/tree/","/")+"/"+f) for f in csv_only]
one_df = pd.concat(combo,ignore_index=True)
df = pd.DataFrame()
DATE_FORMATS = ["%m/%d/%y %H:%M", "%m/%d/%Y %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"]
df["Região"] = one_df["Province/State"].fillna(one_df["Admin2"])
df["País"] = one_df["Country/Region"].fillna(one_df["Country_Region"])
df["Data"] = one_df["Last Update"].fillna(one_df["Last_Update"])
df["Confirmados"] = one_df["Confirmed"]
df["Mortes"] = one_df["Deaths"]
df["Recuperados"] = one_df["Recovered"]
def parse(x_):
    for fmt in DATE_FORMATS:
        try:
            tmp = datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
            return tmp
        except ValueError:
            print(x_)
pd.to_datetime(df['Data'])
df['Data'] = df['Data'].apply(lambda x: parse(x))
#df['Data'].strftime('%m/%d/%Y')
#df['Data'] = df['Data'].map(lambda x: x.strftime('%m/%d/%Y') if x else '')
df.to_excel(r'C:\Users\guilh\Downloads\Covid2\Covid-19.xlsx', index=False, encoding="utf8")
print(df)
from datetime import datetime
import pandas as pd
You could save all possible formats in a list:
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%m/%d/%y %H:%M", "%m/%d/%Y %H:%M"]
Define a function that loops through the formats and tries each one in turn.
(This fixes a bug in the original, where the print statement should have been outside the for loop.)
issues = set()

def parse(x_):
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(x_, fmt).strftime("%m/%d/%Y")
        except ValueError:
            pass
    issues.add(x_)  # only record the value if no format matched

sample = ["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"]
df = pd.DataFrame({'data': sample})
df['data'] = df['data'].apply(lambda x: parse(x))
assert df['data'].isna().sum() == len(issues) == 0, "Issues observed, nulls observed in dataframe"
print("Done")
Output
data
0 01/30/2020
1 01/31/2020
2 02/02/2020
If df.apply() comes across a particular date format that hasn't been defined in the list, the cell is simply left as None, since nothing is returned by parse(); the offending value is also recorded in issues.
Here too, letting pd.to_datetime infer the format does the trick:
import pandas as pd
s = pd.to_datetime(["1/30/20 16:00", "1/31/2020 23:59", "2020-02-02T23:43:02"])
print(s)
# DatetimeIndex(['2020-01-30 16:00:00', '2020-01-31 23:59:00',
# '2020-02-02 23:43:02'],
# dtype='datetime64[ns]', freq=None)
Note that if your date/time format generally gives the day first (e.g. 30.1.2021 for Jan 30th, 2021), set the keyword dayfirst=True.
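For instance, a minimal sketch of that keyword in action:

import pandas as pd

# day-first input: 30.1.2021 means Jan 30th, 2021
s = pd.to_datetime(["30.1.2021"], dayfirst=True)
print(s)
# DatetimeIndex(['2021-01-30'], dtype='datetime64[ns]', freq=None)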

Can I using multiple processes to read different subsets of numpy array (or pandas dataframe) safely?

I want to use multiple processes to work on each 2-column combination of a numpy array (or pandas dataframe), such as array[:, 1:3] and array[:, 2:4].
I wonder whether it is safe to read array[:, 1:3] in one process and array[:, 2:4] in another process?
The example code is shown below:
import time
import numpy as np
import pandas as pd
from itertools import combinations
from multiprocessing import Pool, Value, Lock, Array

g = np.load('input.npy')
c = Value('i', 0, lock=True)

def count_valid_pairs(i):
    pair = g[:, i]
    global c
    if pair.max() > 100:
        with c.get_lock():
            c.value += 1
    return

if __name__ == '__main__':
    t_start = time.time()
    cpus = 20
    p = Pool(processes=cpus)
    r = p.imap_unordered(count_valid_pairs, combinations(range(g.shape[1]), 2))
    p.close()
    p.join()
    print("Total {} pairs have max value > 100".format(c.value))

Get python to add serial nos to each entry as it is run

I am new to programming, and there is probably an answer to my question somewhere, like here, the closest I found after searching for days. Most of the info deals with existing CSVs or hardcoded data. I am trying to make the program create data every time it runs and then work on that data, so I am a little stumped.
The Problem:
I can't seem to get Python to attach serial nos to each entry when I run the program I am making to log my study blocks. It has various fields; following are two of them:
Date Time
12-03-2018 11:30
Following is the code snippet:
d = ''
while d == '':
    d = input('Date:')
    try:
        valid_date = dt.strptime(d, '%Y-%m-%d')
    except ValueError:
        d = ''
        print('Please input date in YYYY-MM-DD format.')

t = ''
while t == '':
    t = input('Time:')
    try:
        valid_time = dt.strptime(t, '%H:%M')
    except ValueError:
        t = ''  # the original reset d here, so an invalid time slipped through
        print('Please input time in HH:MM format.')

header = csv.DictWriter(outfile, fieldnames=['UID', 'Date', 'Time', 'Topic', 'Objective', 'Why', 'Summary'], delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
header.writeheader()
log_input = csv.writer(outfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
log_input.writerow([d, t, topic, objective, why, summary])
outfile.close()

# (two paths left as in the question; the second read overwrites the first)
df = pd.read_csv(r'E:\Coursera\HSU\python\pom_blocks_log.csv')
df = pd.read_csv(r'E:\pom_blocks_log.csv')
df = df.reset_index()
df.columns[0] = 'UID'  # <-- this line raises the TypeError below
df['UID'] = df.index
print(df)
I get the following error when I run the program with the df block:
TypeError: Index does not support mutable operations
I am new to Python and don't really know how to work with data structures, so I am building small programs to learn. Any help is highly appreciated, and apologies if this is a duplicate; please point me in the right direction.
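As an aside, that TypeError arises because a pandas Index is immutable, so df.columns[0] cannot be assigned in place; renaming the column works instead (a minimal sketch, path borrowed from the question):

import pandas as pd

df = pd.read_csv(r'E:\pom_blocks_log.csv')
df = df.reset_index()
# rename() builds a new Index instead of mutating the existing one
df = df.rename(columns={df.columns[0]: 'UID'})
print(df)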
So, I figured it out. Following is the process I followed:
I save the CSV file using the csv module.
I load the CSV file into pandas as a dataframe.
What this does is allow me to append user entries to the CSV every time the program runs, and then load it as a dataframe and use pandas to manipulate the data. I then added a generator to strip the delimiter character ',' from each line, so the file could be loaded as a dataframe even in string columns where ',' is accepted as valid input. Maybe this is a roundabout approach, but it works.
Following is the code:
import csv
from csv import reader
from datetime import datetime
import pandas as pd
import numpy as np

with open(r'E:\Coursera\HSU\08_programming\trLog_df.csv', 'a', encoding='utf-8') as csvfile:
    # Date
    d = ''
    while d == '':
        d = input('Date: ')
        try:
            valid_date = datetime.strptime(d, '%Y-%m-%d')
        except ValueError:
            d = ''
            print("Incorrect data format, should be YYYY-MM-DD")
    # Time
    t = ''
    while t == '':
        t = input('Time: ')
        try:
            valid_date = datetime.strptime(t, '%H:%M')
        except ValueError:
            t = ''
            print("Incorrect data format, should be HH:MM")
    log_input = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    log_input.writerow([d, t])

# Function to clean lines off the delimiter ','
def merge_last(file_name, merge_after_col=7, skip_lines=0):
    with open(file_name, 'r') as fp:
        for i, line in enumerate(fp):
            if i < 2:
                continue
            spl = line.strip().split(',')
            # join everything after merge_after_col back into one field
            # (the original sliced spl[merge_after_col:2], which kept at
            # most one trailing field)
            yield (*spl[:merge_after_col], ','.join(spl[merge_after_col:]))

# Generator to clean the lines
gen = merge_last(r'E:\Coursera\HSU\08_programming\trLog_df.csv', 1)
# get the column names
header = next(gen)
# create the data frame
df = pd.DataFrame(gen, columns=header)
df.head()
print(df)
If anybody has a better solution, it would be enlightening to know how to do it with efficiency and elegance.
Thank you for reading.
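One possible simplification, sketched under the assumption that the log was written with quotechar='|' as above: pandas can be told to honor the same quoting when reading, which would make the line-cleaning generator unnecessary.

import pandas as pd

# embedded commas inside |...| quoted fields are then parsed
# as part of the field, not as delimiters
df = pd.read_csv(r'E:\Coursera\HSU\08_programming\trLog_df.csv', quotechar='|')
print(df.head())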

Python Multiprocessing throwing out results based on previous values

I am trying to learn how to use multiprocessing and have managed to get the code below to work. The goal is to work through every combination of the variables within CostlyFunction by setting n equal to some number (right now it is 100, so the first 100 combinations are tested). I was hoping I could manipulate w as each process returned its list (CostlyFunction returns a list of 7 values) and keep only the results in a given range. Right now, w holds all 100 lists and lets me manipulate them afterwards, but when I use n = 10 million, w becomes huge and costly to hold. Is there a way to evaluate CostlyFunction's output as the workers return values and then throw out the values I don't need?
if __name__ == "__main__":
    import csv
    from time import time
    from multiprocessing import Pool  # CostlyFunction is assumed to be defined elsewhere

    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    #width = -36000000/1000
    #fronteir = [None]*1000
    currtime = time()
    n = 100
    po = Pool()
    res = po.map_async(CostlyFunction, ((i,) for i in range(n)))
    w = res.get()
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(w)
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
Unfortunately, Pool doesn't have a 'filter' method; otherwise, you might've been able to prune your results before they're returned. Pool.imap is probably the best solution you'll find for dealing with your memory issue: it returns an iterator over the results from CostlyFunction.
For sorting through the results, I made a simple list-based class called TopList that stores a fixed number of items. All of its items are the highest-ranked according to a key function.
from collections import UserList

def keyfunc(a):
    return a[5]  # this would be the sixth item in a result from CostlyFunction

class TopList(UserList):
    def __init__(self, key, *args, cap=10):  # cap is the largest number of results
        super().__init__(*args)               # you want to store
        self.cap = cap
        self.key = key

    def add(self, item):
        self.data.append(item)
        self.data.sort(key=self.key, reverse=True)
        if len(self.data) > self.cap:  # drop the lowest-ranked item only once full
            self.data.pop()            # (the original popped unconditionally, so the list never grew)
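As a design note, re-sorting on every add costs O(n log n) per item, while a heap does the same job in O(log n); a minimal alternative sketch (the TopHeap name is hypothetical, keyfunc as above):

import heapq
import itertools

class TopHeap:
    """Keep the cap highest-keyed items seen, as a min-heap of size cap."""
    def __init__(self, key, cap=10):
        self.key = key
        self.cap = cap
        self._tiebreak = itertools.count()  # avoids comparing items directly
        self.data = []

    def add(self, item):
        entry = (self.key(item), next(self._tiebreak), item)
        if len(self.data) < self.cap:
            heapq.heappush(self.data, entry)
        elif entry[0] > self.data[0][0]:  # beats the current minimum
            heapq.heappushpop(self.data, entry)

    def items(self):
        # highest-keyed first
        return [item for _, _, item in sorted(self.data, reverse=True)]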
Here's how your code might look:
if __name__ == "__main__":
    import csv
    from time import time
    from multiprocessing import Pool

    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    n = 100
    currtime = time()
    po = Pool()
    best = TopList(keyfunc)
    result_iter = po.imap(CostlyFunction, ((i,) for i in range(n)))
    for result in result_iter:
        best.add(result)
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(best)  # write the retained top results (was w, which is undefined here)
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
