Fastest iteration on a dataframe by applying a URL function - python-3.x

I need to request some data from a URL, inserting variable=var for each row of my dataframe. I wrote a function that iterates over each row:
import requests
import pandas as pd
from pandas import json_normalize  # on older pandas: from pandas.io.json import json_normalize

def df_eval(data):
    data_eval = data.copy()
    df_price = []
    for i in data_eval.index:
        var = data_eval.at[i, 'var']
        url = "http://blablabla/params&cid={}".format(var)
        r_json = requests.get(url).json()
        df = json_normalize(r_json)
        df_price.append(df['price'])
        print(df_price)
    data_eval['price_eval'] = df_price
    return data_eval
Could you suggest a faster way to do this? Currently it takes about 30 minutes for 23,000 rows.

You could parallelize your calls like this:
import pandas as pd
import numpy as np
from multiprocessing import Pool

n_cores = 4  # number of worker processes to use

data_split = np.array_split(data, n_cores)
pool = Pool(n_cores)
data = pd.concat(pool.map(df_eval, data_split))
pool.close()
pool.join()
Source: https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1
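Since df_eval spends almost all of its time waiting on HTTP responses rather than using the CPU, a thread pool is another option worth trying; it avoids pickling the dataframe chunks between processes. A minimal sketch under that assumption (n_chunks = 8 is an arbitrary choice, not from the original answer):
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd

n_chunks = 8  # arbitrary; tune to how many requests the server tolerates in parallel
chunks = np.array_split(data, n_chunks)
# Each chunk is processed by df_eval in its own thread; requests release the GIL while waiting.
with ThreadPoolExecutor(max_workers=n_chunks) as executor:
    data = pd.concat(executor.map(df_eval, chunks))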

Related

loop over a python list

I have a Python list of IDs which I am calling in my function. There are around 200 IDs. I would like to know the best way to call these IDs in chunks, e.g. 10 or 20 IDs at a time, then the next 20 in the following call, and so on. I have used multithreading here to make it faster, but it still seems to take a lot of time. Here is the code I managed to write:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd
import datetime as dt

df = pd.ExcelFile('ids.xlsx').parse('Sheet1')
x = []
x.append(df['external_ids'].to_list())

def download():
    # client is my python sdk
    dtest_df = client.datapoints.retrieve_dataframe(external_id=x[0], start=0, end="now", granularity='1m')
    dtest_df = dtest_df.rename(columns={'index': 'timestamp'})
    client.datapoints.insert_dataframe(dtest_df, external_id_headers=True, dropna=True)
    print(dtest_df)

with ThreadPoolExecutor(max_workers=20) as executor:
    future = executor.submit(download)
    print(future.result())
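A note on the code above: it submits a single task that retrieves every ID in one call, so the 20 workers are never actually used in parallel. A minimal sketch of the chunked approach the question describes (assuming client is the SDK client from the original snippet and that download is changed to accept a list of IDs):
from concurrent.futures import ThreadPoolExecutor

CHUNK_SIZE = 20
ids = df['external_ids'].to_list()
# Split the full ID list into consecutive chunks of 20.
chunks = [ids[i:i + CHUNK_SIZE] for i in range(0, len(ids), CHUNK_SIZE)]

def download(id_chunk):
    # client is the SDK client from the original snippet (assumption)
    dtest_df = client.datapoints.retrieve_dataframe(external_id=id_chunk, start=0, end="now", granularity='1m')
    dtest_df = dtest_df.rename(columns={'index': 'timestamp'})
    client.datapoints.insert_dataframe(dtest_df, external_id_headers=True, dropna=True)
    return len(id_chunk)

# Each chunk is downloaded and inserted by its own thread.
with ThreadPoolExecutor(max_workers=20) as executor:
    for n in executor.map(download, chunks):
        print('processed', n, 'ids')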

Save scraping results one by one into Excel or CSV file in Python

I have a crawler code as follows:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from datetime import datetime

def crawl(id):
    try:
        url = 'https://www.china0001.com.cn/project/{0:06d}.html'.format(id)
        print(url)
        content = requests.get(url).text
        soup = BeautifulSoup(content, 'lxml')
        tbody = soup.find("table", attrs={"id": "mse_new"}).find("tbody", attrs={"class": "jg"})
        tr = tbody.find_all("tr")
        rows = []
        for i in tr[1:]:
            rows.append([j.text.strip() for j in i.findAll("td")])
        out = dict([map(str.strip, y.split(':')) for x in rows for y in x])
        return out
    except AttributeError:
        return False

data = list()
for id in range(699998, 700010):
    print(id)
    res = crawl(id)
    if res:
        data.append(res)

if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_excel('test.xlsx', index=False)
In this code, the result dataframe df is written to an Excel file only after the whole scraping process has finished.
Now I want to save the scraping results one by one into Excel or CSV file during the scraping process, how could I modify the code above?
Thanks.
Updates:
from concurrent import futures

MAX_WORKERS = 30
ids = range(700000, 700050)
workers = min(MAX_WORKERS, len(ids))

with futures.ThreadPoolExecutor(workers) as executor:
    res = executor.map(crawl, sorted(ids))

data = list(res)
if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_csv('test.csv', mode='a', header=True, index=False)
Try using to_csv with mode='a', header=False, index=False.
Ex:
for id in range(699998, 700010):
    res = crawl(id)
    if res:
        df = pd.DataFrame([res])
        df.to_csv('test.csv', mode='a', header=False, index=False)
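One side effect of appending with header=False is that the resulting CSV has no header row at all. A common workaround (a sketch, not part of the original answer) is to write the header only when the file does not exist yet:
import os

for id in range(699998, 700010):
    res = crawl(id)
    if res:
        df = pd.DataFrame([res])
        # Write the header row only on the very first append.
        df.to_csv('test.csv', mode='a', header=not os.path.exists('test.csv'), index=False)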
I'd recommend looking at my question on here:
What is the problem with the pandas to csv in my code?.
Look at the answers there about the daily sheets, then adapt and modify that approach to fit your program.

'For' loop for reading multiple csv files from a google storage bucket into 1 Pandas DataFrame

I currently have 31 .csv files (all with the same structure: 60 columns wide and about 5,000 rows deep) that I'm trying to read from a Google storage bucket into one pandas dataframe using a 'for' loop, and I keep getting a 'timeout' error after 6 minutes.
Upon doing some testing, I have noticed that I'm able to read one .csv file at a time, but once I introduce two or more, I get the timeout error. This makes me think that my code is the problem rather than the size of the data.
The code is below (should I be using pd.concat at any stage in the loop?). Any help would be appreciated.
def stage1eposdata(data, context):
    from google.cloud import storage
    from google.cloud import bigquery
    import pandas as pd
    import dask.dataframe as dd
    import io
    import numpy as np
    import datetime as dt
    from googleapiclient import discovery
    from pandas.io.json import json_normalize
    import google.auth
    import math

    destination_path1 = 'gs://staged_data/ddf-*_stet.csv'

    ## Source Buckets #
    raw_epos_bucket = 'raw_data'
    cleaned_epos_bucket = 'staged_data'
    # Confirming Oauth #
    storage_client = storage.Client()
    bigquery_client = bigquery.Client()
    # Confirming Connection #
    raw_epos_data = storage_client.bucket(raw_epos_bucket)
    cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)

    df = pd.DataFrame()
    for file in list(raw_epos_data.list_blobs(prefix='2019/')):
        file_path = "gs://{}/{}".format(file.bucket.name, file.name)
        df = df.append(pd.read_csv(file_path), sort=False)

    ddf = dd.from_pandas(df, npartitions=1, sort=True)
    ddf.to_csv(destination_path1, index=True, sep=',')
Try this:
## Source Buckets #
raw_epos_bucket = 'raw_data'
cleaned_epos_bucket = 'staged_data'
# Confirming Oauth #
storage_client = storage.Client()
bigquery_client = bigquery.Client()
# Confirming Connection #
raw_epos_data = storage_client.bucket(raw_epos_bucket)
cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)

my_dataframe_list = []
for file in list(raw_epos_data.list_blobs(prefix='2019/')):
    file_path = "gs://{}/{}".format(file.bucket.name, file.name)
    my_dataframe_list.append(pd.read_csv(file_path))

df = pd.concat(my_dataframe_list)
ddf = dd.from_pandas(df, npartitions=1, sort=True)
ddf.to_csv(destination_path1, index=True, sep=',')
pd.concat joins a list of DataFrames, so in each iteration of the loop you keep the dataframe in the list my_dataframe_list, and outside the loop you concatenate the whole list. If the columns match, it should work.
It turns out that dask handles this type of thing very well thanks to its 'lazy' computation model. My solution is below:
## Source Buckets #
raw_epos_bucket = 'raw_data'
cleaned_epos_bucket = 'staged_data'
# Confirming Oauth #
storage_client = storage.Client()
bigquery_client = bigquery.Client()
# Confirming Connection #
raw_epos_data = storage_client.bucket(raw_epos_bucket)
cleaned_epos_data = storage_client.bucket(cleaned_epos_bucket)

# '*' is a wild card, so no more 'for' loops are needed
ddf = dd.read_csv('gs://raw_data/*.csv')
ddf.to_csv(destination_path1, index=True, sep=',')

Python: Facebook Graph API - batch request

I want to make a batch request to get campaigns for a specific ad account. I created some simple code based on this issue, but I've used some global arrays and I don't know whether time.sleep(2) is necessary here. My code is below:
from facebookads import FacebookAdsApi
from facebookads.api import FacebookRequest
import pandas as pd
import time

batch_body_responses = []
list_of_artists = [1]

def success_callback(response):
    try:
        pair = [response.json()['data']]
        next = [response.json()['paging']['next']]
        batch_body_responses.append(pair)
        batch_body_responses.append(next)
    except IndexError:
        pass
    except UnicodeEncodeError:
        pass

def error_callback(response):
    pass

def generate_batches(iterable, batch_size_limit):
    # This function can be found in examples/batch_utils.py
    batch = []
    for item in iterable:
        if len(batch) == batch_size_limit:
            yield batch
            batch = []
        batch.append(item)
    if len(batch):
        yield batch

def get_id_list(art_search_list):
    batches = []
    your_app_id = '756885'
    your_app_secret = '123456789'
    your_access_token = 'EAA.....'
    api = FacebookAdsApi.init(your_app_id, your_app_secret, your_access_token)
    batch_limit = 25
    for batch in generate_batches(art_search_list, batch_limit):
        next_batch = api.new_batch()
        for artt in batch:
            requestss = [FacebookRequest(node_id='act_1234/campaigns', method="GET", endpoint="?fields=id,name")]
            for req in requestss:
                next_batch.add_request(req, success_callback, error_callback)
        batches.append(next_batch)
    for batch_request in batches:
        batch_request.execute()
        time.sleep(2)
    print(batch_body_responses)
    return batch_body_responses

df = pd.DataFrame(get_id_list(list_of_artists))
How can this code be optimized so that it does not use global arrays, can it be executed without the sleep statement, and why is the sleep needed at all?
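No answer is attached to this question in the thread, but as a rough sketch (not an authoritative solution) the global list can be avoided by defining the callbacks inside get_id_list so they close over a local list; only calls already used in the question appear here:
def get_id_list(art_search_list):
    your_app_id = '756885'
    your_app_secret = '123456789'
    your_access_token = 'EAA.....'
    api = FacebookAdsApi.init(your_app_id, your_app_secret, your_access_token)
    responses = []  # local list instead of the module-level global

    def success_callback(response):
        try:
            responses.append([response.json()['data']])
            responses.append([response.json()['paging']['next']])
        except (KeyError, IndexError, UnicodeEncodeError):
            pass

    def error_callback(response):
        pass

    batch_limit = 25
    for batch in generate_batches(art_search_list, batch_limit):
        next_batch = api.new_batch()
        for artt in batch:
            req = FacebookRequest(node_id='act_1234/campaigns', method="GET", endpoint="?fields=id,name")
            next_batch.add_request(req, success_callback, error_callback)
        next_batch.execute()

    return responses
As for time.sleep(2): it is presumably there to stay under the Graph API rate limits, so dropping it only makes sense when the number of batches is small enough that throttling is not a concern.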

Variable assignment before a function

I have created a package to quickly transform data using pandas and xlsxwriter.
This worked pretty well and I wrote a few functions successfully. But recently I've hit a wall:
For a few functions I need to define variables first, but they are not basic types (list, tuple, str, etc.); they are, for instance, a dataframe. I've looked into global variables and saw that they are not recommended (and I wouldn't know where to put them), and I also looked into classes, but I don't know how to solve my problem with them. I've also tried creating an empty dataframe, but I got an empty dataframe back after the function.
What I'm trying to do is a read function with pandas for .csv or .xlsx files and a function for saving with the XlsxWriter engine.
The goal is to change as little as possible in the code so I can transform data frequently and rapidly (e.g. I have functions doing LEFT and RIGHT like in Excel, or even MIDDLE with column numbers) and keep the code in main.py easy and short.
Here is the stripped-down version of my code, which uses two Python files (main.py and format_operations.py). I have added comments where I'm having issues.
Thanks in advance for your help!
"""
main.py
"""
import format_operations as tbfrm #import another python file in the same folder
import pandas as pd
import numpy as np
import xlsxwriter.utility
#file settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
dfname = ??? #I need to create the variable but I don't know how
tbfrm.FCT_universal_read(dfname,file_full_path) #CAN'T GET IT TO WORK
#column operations and formatting
columns_numeric = [3,6] # (with pandas) list of columns with number values by iloc number, starts at 0 which is column A in Excel
tbfrm.FCT_columns_numeric(dfname,columns_numeric) #example of a WORKING function (if dfname is defined)
#write with Xlsxwriter engine
XLWRITER_DF = ??? #same problem as before, how to create the variable?
workbookvarname = ??? #same here
worksheetvarname = ??? # same here
tbfrm.FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname) #CAN'T GET IT TO WORK
#### WORKING piece of code I want to execute after saving with Xlsxwriter engine ####
worksheet.set_zoom(80)
# Conditional formatting
color_range_1 = "J1:J{}".format(number_rows+1)
FORMAT1 = workbook.add_format({'bg_color': '#FFC7CE','font_color': '#9C0006'})
FORMAT2 = workbook.add_format({'bg_color': '#C6EFCE','font_color': '#006100'})
worksheet.conditional_format(color_range_1, {'type': 'bottom','value': '5','format': FORMAT1})
worksheet.conditional_format(color_range_1, {'type': 'top','value': '5','format': FORMAT2})
Other file:
"""
format_operations.py
"""
import pandas as pd
import numpy as np
import xlsxwriter.utility
def FCT_universal_read(dfname,file_full_path):
if ".xls" in file_full_path:
dfname = pd.read_excel(file_full_path) #optional arguments:sheetname='Sheet1', header=0 , dtype=object to preserve values
if ".csv" in file_full_path:
dfname = pd.read_csv(file_full_path)
# save file with XLSXWriter engine for additional options to pandas
def FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname):
XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to,encoding='utf-8')
workbookvarname = XLWRITER_DF.book
worksheetvarname = XLWRITER_DF.sheets[sheet_name_save_to]
#format as numbers
def FCT_columns_numeric(dfname,columns_numeric):
for x in columns_numeric:
dfname.iloc[:,x] = pd.to_numeric(dfname.iloc[:,x])
Your FCT_universal_read function should not modify a dataframe but instead return a new one:
def FCT_universal_read(file_full_path):
    if file_full_path.split('.')[-1] == "xls":
        df = pd.read_excel(file_full_path) #optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
    if file_full_path.split('.')[-1] == "csv":
        df = pd.read_csv(file_full_path)
    return df
And in your main, do:
dfname = tbfrm.FCT_universal_read(file_full_path)
Same answer for FCT_df_xlsxwriter, you should rewrite it with a return so that you can do:
XLWRITER_DF, workbookvarname,worksheetvarname = tbfrm.FCT_df_xlsxwriter(dfname,file_save_to,sheet_name_save_to)
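The answer does not show the rewritten FCT_df_xlsxwriter itself; a minimal sketch of what it could look like (the asker posts essentially this version further down):
def FCT_df_xlsxwriter(dfname, file_save_to, sheet_name_save_to):
    # Create the writer, dump the dataframe, and hand the workbook/worksheet back to the caller.
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to)
    workbookvarname = XLWRITER_DF.book
    worksheetvarname = XLWRITER_DF.sheets[sheet_name_save_to]
    return XLWRITER_DF, workbookvarname, worksheetvarname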
To grasp how Python deals with the arguments you pass to a function, you should read these blog posts:
https://jeffknupp.com/blog/2012/11/13/is-python-callbyvalue-or-callbyreference-neither/
https://robertheaton.com/2014/02/09/pythons-pass-by-object-reference-as-explained-by-philip-k-dick/
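The key point from those posts, in a minimal illustrative snippet: rebinding a parameter inside a function does not affect the caller's variable, whereas mutating the object it points to does:
import pandas as pd

def rebind(df):
    # Rebinds the local name only; the caller's dataframe is unchanged.
    df = pd.DataFrame({'b': [3]})

def mutate(df):
    # Mutates the object in place; the caller sees the new column.
    df['flag'] = 1

frame = pd.DataFrame({'a': [1, 2]})
rebind(frame)                   # frame still has only column 'a'
mutate(frame)                   # frame now also has a 'flag' column
print(frame.columns.tolist())   # ['a', 'flag']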
You need to update FCT_universal_read so that it returns the dataframe you want. There is no need to define the dataframe outside the function; simply create and return it:
df = FCT_universal_read('/your/file/path')
def FCT_universal_read(file_full_path):
    if ".xls" in file_full_path:
        df = pd.read_excel(file_full_path) #optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
        return df
    if ".csv" in file_full_path:
        df = pd.read_csv(file_full_path)
        return df
Thanks so much to both of you!! I get the logic now :) Thanks also for the documentation.
I successfully managed to get both functions working; I had been struggling for several hours.
I like the .split function that you used, which ensures the script only looks at the extension.
I updated FCT_df_xlsxwriter and FCT_universal_read as you suggested. Here are both functions, corrected:
'''
format_operations.py
'''
import pandas as pd

def FCT_universal_read(file_full_path):
    if "xls" in file_full_path.split('.')[-1]:
        dfname = pd.read_excel(file_full_path) #example: C:/Tests/Bigdata.xlsx
        return dfname
    if "csv" in file_full_path.split('.')[-1]:
        dfname = pd.read_csv(file_full_path)
        return dfname

def FCT_df_xlsxwriter(dfname, file_save_to, sheet_name_save_to):
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to, encoding='utf-8')
    workbook = XLWRITER_DF.book
    worksheet = XLWRITER_DF.sheets[sheet_name_save_to]
    return XLWRITER_DF, workbook, worksheet
Here is how I call the two functions:
'''
main.py
'''
import format_operations as tbfrm
import pandas as pd
import xlsxwriter.utility
#settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
#functions
FILE_DF = tbfrm.FCT_universal_read(file_full_path)
XLWRITER_DF,workbook,worksheet = tbfrm.FCT_df_xlsxwriter(FILE_DF,file_save_to,sheet_name_save_to)
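With workbook and worksheet returned by FCT_df_xlsxwriter, the conditional-formatting block from the original main.py can now run right after these calls. A sketch assuming number_rows should be the row count of the dataframe, and that the writer still needs to be saved at the end (not shown in the original snippets):
number_rows = len(FILE_DF.index)  # assumption: the range should cover every data row
worksheet.set_zoom(80)
# Conditional formatting on column J
color_range_1 = "J1:J{}".format(number_rows + 1)
FORMAT1 = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
FORMAT2 = workbook.add_format({'bg_color': '#C6EFCE', 'font_color': '#006100'})
worksheet.conditional_format(color_range_1, {'type': 'bottom', 'value': '5', 'format': FORMAT1})
worksheet.conditional_format(color_range_1, {'type': 'top', 'value': '5', 'format': FORMAT2})
XLWRITER_DF.save()  # persist the file to disk (use close() on newer pandas versions)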
