Combining several dataframe results with a for loop in Python Pandas - python-3.x

Let's say I have these functions:
def query():
    dict = (
        { "NO" : 1, "PART" : "ALPHA" },
        { "NO" : 2, "PART" : "BETA" }
    )
    finalqueryresult = pandas.DataFrame()
    for info in dict:  # I use this loop to send one query per record in dict, twice in this example
        finalqueryresult.append( sendquery(info["NO"], info["PART"]) )

def sendquery(no, part):
    # ...some code to request the query from the server and save it in the reqresult variable...
    return reqresult
In the example above, sending the first query (the record with "NO" = 1) returns this (let's say this is df1):
NAME COUNTRY
1 RYO JPN
2 JON NZ
and the last query (the record with "NO" = 2) returns this (let's say this is df2):
NAME COUNTRY
1 TING CN
2 ASHYU INA
What I want is for finalqueryresult to look like this (df1 combined with df2):
NAME COUNTRY
1 RYO JPN
2 JON NZ
3 TING CN
4 ASHYU INA
But it fails: finalqueryresult is always empty. I suppose something is wrong with this:
for info in dict:
    finalqueryresult.append( sendquery(info["NO"], info["PART"]) )

I think you first need to append all of the DataFrames to a list dfs and then use concat. Note that DataFrame.append returns a new DataFrame rather than modifying finalqueryresult in place (it has since been deprecated in favour of concat), which is why your result stays empty:
dfs = []
for info in dict:
    # sendquery(info["NO"], info["PART"]) returns a DataFrame
    dfs.append( sendquery(info["NO"], info["PART"]) )
finalqueryresult = pd.concat(dfs, ignore_index=True)
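Putting it together, a minimal sketch of the corrected query(), assuming sendquery() returns a DataFrame as described (the tuple is renamed to records to avoid shadowing the dict built-in):
import pandas as pd

def query():
    records = (
        { "NO" : 1, "PART" : "ALPHA" },
        { "NO" : 2, "PART" : "BETA" }
    )
    # collect one DataFrame per request, then concatenate once at the end
    dfs = [sendquery(info["NO"], info["PART"]) for info in records]
    return pd.concat(dfs, ignore_index=True)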

Related

Iterating over an API response breaks for only one column in a Pandas dataframe

Problem: In my dataframe, when looping through zip codes of a weather API, I am getting the SAME value for the column "desc": every value is "cloudy" (which is incorrect for some zip codes). I think it is taking the value from the very last zip code in the list and applying it to every row in the Desc column.
But if I run only zip code 32303 and comment out all the other zip codes, the value for "Desc" is correct: it is listed as sunny/clear, which proves the values produced while looping are incorrect.
Heck, it's Florida! ;)
Checking other weather sources, I know "sunny/clear" is the correct value for 32303, not "cloudy". So for some reason, iterating is breaking on the Desc column only. I've tried so many options and am just stuck. Any ideas how to fix this?
import requests
import pandas as pd
api_key = 'a14ac278e4c4fdfd277a5b37e1dbe87a'
#Create a dictionary of zip codes for the team
zip_codes = {
    55446: "You",
    16823: "My Boo",
    94086: "Your Boo",
    32303: "Mr. Manatee",
    95073: "Me"
}
# Create a list of zip codes
zip_list = list(zip_codes.keys())
# Create a list of names
name_list = list(zip_codes.values())
#For team data, create a pandas DataFrame from the dictionary
df1 = pd.DataFrame(list(zip_codes.items()),
                   columns=['Zip Code', 'Name'])
# Create empty lists to hold the API response data
city_name = []
description = []
weather = []
feels_like = []
wind = []
clouds = []
# Loop through each zip code
for zip_code in zip_list:
    # Make a request to the OpenWeatherMap API
    url = f"http://api.openweathermap.org/data/2.5/weather?zip={zip_code},us&units=imperial&appid={api_key}"
    response = requests.get(url).json()
    # Store the response data in the appropriate empty list
    city_name.append(response['name'])
    description = response['weather'][0]['main']
    weather.append(response['main']['temp'])
    feels_like.append(response['main']['feels_like'])
    wind.append(response['wind']['speed'])
    clouds.append(response['clouds']['all'])
    # rain.append(response['humidity']['value'])
# For weather data, create df from lists
df2 = pd.DataFrame({
    'City': city_name,
    'Desc': description,
    'Temp (F)': weather,
    'Feels like': feels_like,
    'Wind (mph)': wind,
    'Clouds %': clouds,
    # 'Rain (1hr)': rain,
})
# Merge df1 & df2, round decimals, and don't display index or zip.
df3=pd.concat([df1,df2],axis=1,join='inner').drop('Zip Code', axis=1)
df3[['Temp (F)', 'Feels like', 'Wind (mph)', 'Clouds %']] = df3[['Temp (F)', 'Feels like', 'Wind (mph)', 'Clouds %']].astype(int)
# Don't truncate df
pd.set_option('display.width', 150)
# Print the combined DataFrames
display(df3.style.hide_index())
Example output; note that "Desc" has the same value "Clouds" for every row, but I know that is not correct since some locations differ.
Name City Desc Temp (F) Feels like Wind (mph) Clouds %
You Minneapolis Clouds 1 -10 12 100
My Boo Bellefonte Clouds 10 -1 15 100
Your Boo Sunnyvale Clouds 54 53 6 75
Mr. Manatee Tallahassee Clouds 49 49 3 0
Me Soquel Clouds 53 52 5 100
For example, if I comment out all the zip codes except for 32303: "Mr. Manatee", then I get a different value:
Name City Desc Temp (F) Feels like Wind (mph) Clouds %
Mr. Manatee Tallahassee Clear 49 49 3 0
To solve this, I tried another approach, below, which DOES give correct values for each zip code. The problem is that several of the columns hold JSON values, and if I can't fix the code above, then I need to parse them and show only the relevant values. But my preference would be to fix the code above!
import requests
import pandas as pd
import json
zip_codes = {
    95073: "Me",
    55446: "You",
    16823: "My Boo",
    94086: "Your Boo",
    32303: "Mr. Manatee"
}
import pandas as pd
import requests
# Create a list of zip codes
zip_list = list(zip_codes.keys())
# Create a list of names
name_list = list(zip_codes.values())
# Create a list of weather data
weather_list = []
# Set the API key
api_key = 'a14ac278e4c4fdfd277a5b37e1dbe87a'
# Get the weather data from the openweather API
for zip_code in zip_list:
    api_url = f'http://api.openweathermap.org/data/2.5/weather?zip={zip_code},us&units=imperial&appid={api_key}'
    response = requests.get(api_url).json()
    weather_list.append(response)
# Create the dataframe
df = pd.DataFrame(weather_list)
# Add the name column
df['Name'] = name_list
# Parse the 'weather' column
#THIS DOESN'T WORK! df.weather.apply(lambda x: x[x]['main'])
# Drop unwanted columns
df.drop(['coord', 'base', 'visibility','dt', 'sys', 'timezone','cod'], axis=1)
I tried a different approach but got unusable JSON values. I tried various ways to fix the looping in my first approach, but I still get the same value for "Desc" instead of unique values corresponding to each zip code.
Like jqurious said, you had a bug in your code:
description = response['weather'][0]['main']
This means description ends up storing only the description of the final zip code in the dictionary, and that single string is repeated across the whole dataframe. No wonder they are all the same.
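For reference, the minimal fix to the first approach is to append to the description list inside the loop instead of overwriting it:
description.append(response['weather'][0]['main'])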
Since you are collecting data to build a dataframe, it's better to use a list of dictionaries rather than a series of lists:
data = []
for zip_code in zip_list:
    url = f"http://api.openweathermap.org/data/2.5/weather?zip={zip_code},us&units=imperial&appid={api_key}"
    response = requests.get(url).json()
    data.append({
        "City": response["name"],
        "Desc": response["weather"][0]["main"],
        "Temp (F)": response["main"]["temp"],
        "Feels like": response["main"]["feels_like"],
        "Wind (mph)": response["wind"]["speed"],
        "Clouds %": response["clouds"]["all"]
    })
# You don't need to redefine the column names here
df2 = pd.DataFrame(data)

Find if any value from a list exists anywhere in a dataframe

I have a list of specific company identification numbers.
ex. companyID = ['1','2','3']
and I have a dataframe of different attributes relating to company business.
ex. company_df
There are multiple columns where values from my list could be.
ex. 'company_number', 'company_value', 'job_referred_by', etc.
How can I check if any value from my companyID list exists anywhere in my company_df, regardless of datatype, and return only the columns where a companyID is found?
This is what I have tried, to no luck:
def find_any(company_df, companyID):
    found = company_df.isin(companyID).any()
    foundCols = found.index[found].tolist()
    print(foundCols)
Create a df from your list of companyIDs and then merge the two dfs on company ID. Then filter the df to show only the rows that match.
For datatypes, you can convert int to string no problem, but the other way around would crash if you have a string that can't be converted to int (e.g., 'a'), so I'd use string.
Here's a toy example:
import pandas as pd

company_df = pd.DataFrame({'co_id': [1, 2, 4, 9]})
company_df['co_id'] = company_df['co_id'].astype(str)
companyID = ['1','2','3']
df_companyID = pd.DataFrame(companyID, columns=['co_id'])
company_df = company_df.merge(df_companyID, on='co_id', how='left', indicator=True)
print(company_df)
# co_id _merge
# 0 1 both
# 1 2 both
# 2 4 left_only
# 3 9 left_only
company_df_hits_only = company_df[company_df['_merge'] == 'both']
del company_df['_merge']
del company_df_hits_only['_merge']
print(company_df_hits_only)
# co_id
# 0 1
# 1 2
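If you specifically want the column names where any companyID appears, as in your find_any attempt, here is a sketch along the same lines; the columns below are made up to mirror the ones you mentioned, and everything is compared as strings to sidestep the datatype issue:
import pandas as pd

company_df = pd.DataFrame({
    'company_number': [1, 2, 9],
    'company_value': [7, 3, 8],
    'job_referred_by': ['5', 'x', 'y'],
})
companyID = ['1', '2', '3']

# Cast every cell to string so int columns still match the string IDs
mask = company_df.astype(str).isin(companyID)
found = mask.any()
print(found.index[found].tolist())
# ['company_number', 'company_value']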

Dataframe manipulation in Python based on conditions

input_df1: ID MSG
id-1 'msg1'
id-2 'msg2'
id-3 'msg3'
ref_df2: ID MSG
id-1 'msg1'
id-2 'xyzz'
id-4 'msg4'
I am trying to generate an output dataframe based on the following conditions:
If both the 'id' and 'msg' values in input_df match the values in ref_df = matched
If the 'id' value in input_df doesn't exist in ref_df = notfound
If only the 'id' value in input_df matches the 'id' value in ref_df = not_matched
sample output: ID MSG flag
id-1 'msg1' matched
id-2 'msg2' not_matched
id-3 'msg3' notfound
I can do it using lists but considering the fact that I deal with huge amounts of data, performance is important, hence I am looking for a much faster solution.
Any little help will be highly appreciated.
Let's use map to map the ids to the reference messages and use np.select:
import numpy as np

ref_msg = df1['ID'].map(df2.set_index('ID')['MSG'])
df1['flag'] = np.select((ref_msg.isna(), ref_msg == df1['MSG']),
                        ('not found', 'matched'), 'not_matched')
Output (df1):
ID MSG flag
0 id-1 'msg1' matched
1 id-2 'msg2' not_matched
2 id-3 'msg3' not found
You can also use indicator=True parameter of df.merge:
In [3867]: x = df1.merge(df2, how='outer', indicator=True).groupby('ID', as_index=False).last()
In [3864]: d = {'both':'matched', 'right_only':'not_matched', 'left_only':'notfound'}
In [3869]: x._merge = x._merge.map(d)
In [3871]: x
Out[3871]:
ID MSG _merge
0 id-1 'msg1' matched
1 id-2 'xyzz' not_matched
2 id-3 'msg3' notfound
The fastest and the most Pythonic way of doing what you want to do is to use dictionaries, as shown below:
list_ID_in = ['id-1', 'id-2', 'id-3']
list_msg_in = ['msg1', 'msg2', 'msg3']
list_ID_ref = ['id-1', 'id-2', 'id-4']
list_msg_ref = ['msg1', 'xyzz', 'msg4']

dict_in = {k: v for (k, v) in zip(list_ID_in, list_msg_in)}
dict_ref = {k: v for (k, v) in zip(list_ID_ref, list_msg_ref)}

list_out = [None] * len(dict_in)
for idx, key in enumerate(dict_in.keys()):
    try:
        ref_value = dict_ref[key]
        if ref_value == dict_in[key]:
            list_out[idx] = 'matched'
        else:
            list_out[idx] = 'not_matched'
    except KeyError:
        list_out[idx] = 'not_found'
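To turn list_out into the output dataframe the question asks for, a small follow-up sketch reusing the names above:
import pandas as pd

out_df = pd.DataFrame({'ID': list(dict_in.keys()),
                       'MSG': list(dict_in.values()),
                       'flag': list_out})
# out_df: id-1 -> matched, id-2 -> not_matched, id-3 -> not_found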

Most efficient way to compare two pandas dataframes and update one based on a condition

I have two dataframes, df1 and df2. df2 consists of "tagname" and "value" columns. The dictionary bucket_dict holds the data from df2:
bucket_dict = dict(zip(df2.tagname, df2.value))
df1 has millions of rows and three columns: "apptag", "comments" and "Type". I want to match the two dataframes like this: if a dictionary key from bucket_dict is contained in df1["apptag"], then update df1["comments"] to that key and df1["Type"] to the corresponding bucket_dict value. I used the code below:
for each_tag in bucket_dict:
    df1.loc[df1["apptag"].str.match(each_tag, case=False, na=False), "comments"] = each_tag
    df1.loc[df1["apptag"].str.match(each_tag, case=False, na=False), "Type"] = bucket_dict[each_tag]
Is there any efficient way to do this? It's currently taking a long time.
Bucketing df from which the dictionary has been created:
bucketing_df = pd.DataFrame([["pen", "study"], ["pencil", "study"], ["ersr","study"],["rice","grocery"],["wht","grocery"]], columns=['tagname', 'value'])
Other dataframe:
output_df = pd.DataFrame([["test123-pen", "pen", " "], ["test234-pencil", "pencil", " "], ["test234-rice", "rice", " "]], columns=['apptag', 'comments', 'type'])
Required output:
You can do this by calling an apply on your apptag column along with a loc on your bucketing_df, in this manner:
def find_type(a):
    try:
        return (bucketing_df.loc[[x in a for x in bucketing_df['tagname']]])['value'].values[0]
    except:
        return ""

def find_comments(a):
    try:
        return (bucketing_df.loc[[x in a for x in bucketing_df['tagname']]])['tagname'].values[0]
    except:
        return ""

output_df['type'] = output_df['apptag'].apply(lambda a: find_type(a))
output_df['comments'] = output_df['apptag'].apply(lambda a: find_comments(a))
Here I had to make them separate functions so they could handle cases where no tagname exists in apptag.
It gives you this as the output_df:
apptag comments type
0 test123-pen pen study
1 test234-pencil pencil study
2 test234-rice rice grocery
All this code uses is the existing bucketing_df and output_df you provided at the end of your question.
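Since the question is specifically about efficiency, one possible vectorized alternative is to build a single regex from the tagnames and let pandas do the substring matching in one pass. This is only a sketch against the toy frames above (case-sensitive, with longer tags tried first so 'pencil' wins over its prefix 'pen'), not a drop-in replacement for the apply-based answer:
import re

# longest tagnames first so 'pencil' is preferred over 'pen'
tags = sorted(bucketing_df['tagname'], key=len, reverse=True)
pattern = '(' + '|'.join(map(re.escape, tags)) + ')'

output_df['comments'] = output_df['apptag'].str.extract(pattern, expand=False)
output_df['type'] = output_df['comments'].map(dict(zip(bucketing_df['tagname'], bucketing_df['value'])))
# comments become pen / pencil / rice and type becomes study / study / grocery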

How to create multiple dataframes using multiple functions

I quite often write a function to return different dataframes based on the parameters I enter. Here's an example dataframe:
import numpy as np
import pandas as pd

np.random.seed(1111)
df = pd.DataFrame({
    'Category': np.random.choice(['Group A','Group B','Group C','Group D'], 10000),
    'Sub-Category': np.random.choice(['X','Y','Z'], 10000),
    'Sub-Category-2': np.random.choice(['G','F','I'], 10000),
    'Product': np.random.choice(['Product 1','Product 2','Product 3'], 10000),
    'Units_Sold': np.random.randint(1, 100, size=10000),
    'Dollars_Sold': np.random.randint(100, 1000, size=10000),
    'Customer': np.random.choice(pd.util.testing.rands_array(10, 25, dtype='str'), 10000),
    'Date': np.random.choice(pd.date_range('1/1/2016', '12/31/2018', freq='M'), 10000)})
I then created a function to perform sub-totals for me like this:
def some_fun(DF1, agg_column, myList=[], *args):
    y = pd.concat([
        DF1.assign(**{x: '[Total]' for x in myList[i:]})
           .groupby(myList).agg(sumz=(agg_column, 'sum'))
        for i in range(1, len(myList) + 1)
    ]).sort_index().unstack(0)
    return y
I then write out lists that I'll pass as arguments to the function:
list_one = [pd.Grouper(key='Date',freq='A'),'Category','Product']
list_two = [pd.Grouper(key='Date',freq='A'),'Category','Sub-Category','Sub-Category-2']
list_three = [pd.Grouper(key='Date',freq='A'),'Sub-Category','Product']
I then have to run each list through my function creating new dataframes:
df1 = some_fun(df,'Units_Sold',list_one)
df2 = some_fun(df,'Dollars_Sold',list_two)
df3 = some_fun(df,'Units_Sold',list_three)
I then use a function to write each of these dataframes to an Excel worksheet. This is just an example - I perform this same exercise 10+ times.
My question - is there a better way to perform this task than to write out df1, df2, df3 with the function information applied? Should I be looking at using a dictionary or some other data type to do this more pythonically with a function?
A dictionary would be my first choice:
variations = [('Units_Sold', list_one), ('Dollars_Sold', list_two),
              ..., ('Title', some_list)]

df_variations = {}
for i, v in enumerate(variations):
    name = v[0]
    data = v[1]
    df_variations[i] = some_fun(df, name, data)
You might further consider setting the keys to unique / helpful titles for the variations that go beyond something like 'Units_Sold', which isn't unique in your case.
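For example, a small sketch of that idea; the descriptive key names here are made up:
named_variations = {
    'units_by_category_product': ('Units_Sold', list_one),
    'dollars_by_subcategories': ('Dollars_Sold', list_two),
}
df_variations = {name: some_fun(df, agg_col, col_list)
                 for name, (agg_col, col_list) in named_variations.items()}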
IIUC, as Thomas has suggested, we can use a dictionary to parse through your data. With some minor modifications to your function, we can use the dictionary to hold all of the required data and then pass that through to your function.
The idea is to pass two kinds of keys: the lists of columns and the arguments to your pd.Grouper call.
data_dict = {
    "Units_Sold": {"key": "Date", "freq": "A"},
    "Dollars_Sold": {"key": "Date", "freq": "A"},
    "col_list_1": ["Category", "Product"],
    "col_list_2": ["Category", "Sub-Category", "Sub-Category-2"],
    "col_list_3": ["Sub-Category", "Product"],
}
def some_fun(dataframe, agg_col, dictionary, column_list, *args):
    key = dictionary[agg_col]["key"]
    frequency = dictionary[agg_col]["freq"]
    myList = [pd.Grouper(key=key, freq=frequency), *dictionary[column_list]]
    y = (
        pd.concat(
            [
                dataframe.assign(**{x: "[Total]" for x in myList[i:]})
                .groupby(myList)
                .agg(sumz=(agg_col, "sum"))
                for i in range(1, len(myList) + 1)
            ]
        )
        .sort_index()
        .unstack(0)
    )
    return y
Test.
df1 = some_fun(df,'Units_Sold',data_dict,'col_list_3')
print(df1)
sumz
Date 2016-12-31 2017-12-31 2018-12-31
Sub-Category Product
X Product 1 18308 17839 18776
Product 2 18067 19309 18077
Product 3 17943 19121 17675
[Total] 54318 56269 54528
Y Product 1 20699 18593 18103
Product 2 18642 19712 17122
Product 3 17701 19263 20123
[Total] 57042 57568 55348
Z Product 1 19077 17401 19138
Product 2 17207 21434 18817
Product 3 18405 17300 17462
[Total] 54689 56135 55417
[Total] [Total] 166049 169972 165293
As you want to automate the writing of the 10+ worksheets, we can again do that with a dictionary that drives calls to your function:
matches = {'Units_Sold': ['col_list_1', 'col_list_3'],
           'Dollars_Sold': ['col_list_2']}
Then a simple for loop writes each dataframe to its own sheet in a single Excel file; change this to match your required behavior.
writer = pd.ExcelWriter('finished_excel_file.xlsx')
for key, value in matches.items():
    for items in value:
        dataframe = some_fun(df, key, data_dict, items)
        dataframe.to_excel(writer, sheet_name=f'{key}_{items}')
writer.save()
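Note that writer.save() has been deprecated in more recent pandas versions; if you are on one of those, the same loop can run inside a context manager, which closes the file for you:
with pd.ExcelWriter('finished_excel_file.xlsx') as writer:
    for key, value in matches.items():
        for items in value:
            some_fun(df, key, data_dict, items).to_excel(writer, sheet_name=f'{key}_{items}')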
