Is there a way to find specific values in a table and make it a df? - python-3.x

I have an excel file which looks like this. I want to read it and take the dataframe1 with data1 in a df and dataframe2 with data2 in another df. The location of these data can be random. Is there a way to find it and make it a df?
the tables does not exist in the same position always but the headers are always consistent.
import pandas as pd
from openpyxl import load_workbook
wb = load_workbook('Book1.xlsx',data_only=True)
ws = wb.active
df = pd.DataFrame()
for row in ws.iter_rows():
for cell in row:
if cell.value == 'data1':
print('found')
df = pd.read_excel('test123.xlsx', sheet_name='Sheet1', skiprows=cell.row, nrows=2, usecols=range(cell.col-1,cell.col+3))
print(df)

Here is a class that should work. data_range arg should be a range much larger than where the data is located.
import re
import numpy as np
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils import get_column_interval
class ExcelDataFinder:
"""
param: data_range: str
param: search_for: list or str
return: list or pd.DataFrame
"""
def __init__(self, **kwargs):
path = "/Users/joeblow/Documents/test.xlsx"
self.work_book = load_workbook(filename=path, read_only=True, data_only=True)
self.data_range = kwargs.get("data_range")
self.search_for = kwargs.get("search_for")
def main(self) -> pd.DataFrame or list:
return self.find_data_from_list() if isinstance(self.search_for, list) else self.find_data_from_string()
def load_workbook_range(self) -> pd.DataFrame:
col_start, col_end = re.findall("[A-Z]+", self.data_range)
data_rows = [[x.value for x in row] for row in self.work_book.active[self.data_range]]
return pd.DataFrame(data_rows, columns=get_column_interval(col_start, col_end))
def find_data_from_string(self) -> pd.DataFrame:
df = self.load_workbook_range().fillna(np.nan)
df = df[np.where(df.eq(self.search_for))[0][0]:]
df = df.rename(columns=df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
try:
return df.drop(columns=np.nan).dropna(how="all", axis=0).dropna(how="all", axis=1).reset_index(drop=True)
except KeyError:
pass
return df
def find_data_from_list(self) -> list:
dfs = []
for value in self.search_for:
df = self.load_workbook_range().fillna(np.nan)
df = df[np.where(df.eq(value))[0][0]:]
df = df.rename(columns=df.iloc[0]).drop(df.index[0]).reset_index(drop=True)
try:
df = df.drop(columns=np.nan).dropna(how="all", axis=0).dropna(how="all", axis=1).reset_index(drop=True)
except KeyError:
pass
dfs.append(df)
return dfs
if __name__ == "__main__":
df1, df2 = ExcelDataFinder(data_range="A1:Z5000", search_for=["dataframe1", "dataframe2"]).main()
print(f"{df1}\n")
print(df2)
dataframe1 data1
0 A610 656
1 B655 353
2 C698 876
dataframe2 data2
0 A611 654
1 B646 454
2 C694 796

Related

How to add entire dataframe row as scatter plot annotation

I'm plotting two columns of a Pandas DataFrame on a scatterplot and I want each point to show all the row values of the DataFrame. I've looked at this post, and tried to do something similar with mplcursors:
import pandas as pd
from datetime import date, datetime, time, timedelta
import numpy as np
import matplotlib.pyplot as plt
from mplcursors import cursor
df = pd.DataFrame()
df['datetime'] = pd.date_range(start='2016-01-01', end='2016-01-14', freq='30T')
#df = df.set_index('datetime')
df['x1'] = np.random.randint(-30, 30, size=len(df))
df['x2'] = np.random.randint(-30, 20, size=len(df))
df['x3'] = np.random.randint(-20, 30, size=len(df))
df['y1'] = np.random.randint(-100, 100, size=len(df))
df['y2'] = np.random.randint(-300, 200, size=len(df))
df['y3'] = np.random.randint(-200, 300, size=len(df))
def conditions(s):
if (s['y1'] > 20) or (s['y3'] < 0):
return 'group1'
elif (s['x3'] < 20):
return 'group2'
elif (s['x2'] == 0):
return 'group3'
else:
return 'group4'
df['category'] = df.apply(conditions, axis=1)
fig = plt.figure(figsize=(12,4))
ax1 = plt.subplot(121)
ax1.scatter(df.x1, df.y1, label='test1')
ax1.scatter(df.x2, df.y2, label='test2')
#cursor(hover=True)
ax1.set_xlabel('test1')
ax1.set_ylabel('test2')
ax1.legend(['test1','test2'])
cr1 = cursor(ax1,hover=True)
#ax1.annotation_names = df.columns.tolist()
cr1.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
ax2 = plt.subplot(122)
ax2.scatter(df.x1, df.y1, label='test1')
ax2.scatter(df.x3, df.y3, label='test3')
ax2.set_xlabel('test1')
ax2.set_ylabel('test3')
ax2.legend(['test1','test3'])
cr2 = cursor(ax2,hover=True)
#ax2.annotation_names = df.columns.tolist()
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
# save figure
import pickle
pickle.dump(fig, open('FigureObject.fig.pickle', 'wb'))
plt.show()
When I hover over a point, I want to see a label containing (for example):
datetime = 2016-01-01 00:00:00
x1 = 1
x2 = -4
x3 = 22
y1 = -42
y2 = -219
y3 = -158
category = group1
but I get this type of error:
cr2.connect("add", lambda x: x.annotation.set_text(df.columns.tolist()[x.target.index]))
IndexError: list index out of range
How do I fix it?
The IndexError occurs because of df.columns.tolist()[x.target.index]
df.columns.tolist() is a list of 7 columns, which is then indexed by [x.target.index].
df.iloc[x.target.index, :].to_dict() will get the desired row data for the point as a dict
A list comprehension creates a list of strings for each key value pair
'\n'.join(...) creates a string with each column separated by a \n
In mplcursors v0.5.1, Selection.target.index is deprecated, use Selection.index instead.
df.iloc[x.index, :] instead of df.iloc[x.target.index, :]
cr1.connect("add", lambda x: x.annotation.set_text('\n'.join([f'{k}: {v}' for k, v in df.iloc[x.index, :].to_dict().items()])))
Alternatively, use .to_string()
cr1.connect("add", lambda x: x.annotation.set_text(df.iloc[x.index, :].to_string()))

Python - Load multiple excel files with multiple sheets in it with specific columns

I have a problem scenario where I need to load excel files using Python
Load multiple excel files from a folder - Done
Each excel file has multiple sheets - Done
Need to load only required columns ('Receive Date','Process Date','Process Number','Task Name','Series','Office','Department','Unit Manager','AM'), other columns needs to be ignored/dropped and no error should be raised if the above columns does not exist in some sheets.
Load all the data into single data frame
------ Code -------
import pandas as pd
import os
import glob
def getfilepath():
path = 'C:/Users/Tracking Logs/'
files=(os.listdir(path))
allfiles = glob.glob(path+"*.xlsx")
def getdatafromexcel():
for file in allfiles:
rawdf = pd.read_excel(file,sheet_name=None,na_values='null',keep_default_na=False,dtype=object,date_parser=True)
cols=('Receive Date','Process Date','Process Number','Task Name','Series','Office','Department','Unit Manager','AM/AA/PC')
display(df)
getfilepath()
getdatafromexcel()
I found the solution:
import pandas as pd
import os
import glob
from IPython.display import HTML,display
from openpyxl import load_workbook
path = 'C:/Users/Tracking Logs/'
cols = ['Receive Date','Process Date','Task Name','Series','Office','Department','Unit Manager','AM/AA/PC']
def getfilepath(path):
files=(os.listdir(path))
allfiles = glob.glob(path+"*.xlsx")
#print('Allfiles: ',allfiles)
return allfiles
def getdatafromexcel(cols,allfiles):
for i in range(len(allfiles)):
print('\nCounter: ',i,' \nFilenames: ',allfiles[i])
wb = load_workbook(allfiles[i],read_only=True)
for sheetname in wb.sheetnames:
print('Sheetname: ',sheetname)
try:
df = pd.read_excel(allfiles[i],sheet_name=sheetname,na_values='null',usecols=cols,
keep_default_na=False,dtype=object)
Indexnames = df[(df["Task Name"] == '') & (df["Series"] == '') & (df["Office"] == '')].index
df.drop(Indexnames,inplace=True)
display(df)
fulldf=fulldf.append(df,ignore_index=True)
except Exception as e:
print(e)
finally:
print('this executed')
wb.close()
display(fulldf)
allfiles = getfilepath(path)
getdatafromexcel(cols,allfiles)
One can use pd.ExcelFile and pd.read_excel to get the required results.
def getdatafromexcel():
for file in allfiles:
xl = pd.ExcelFile(file)
res = len(xl.sheet_names)
if res>1:
for i in range(1, res+1):
df = pd.read_excel(file, sheet_name= '%d' %i)
# Do selection, preprocessing what you want here
if i == 1:
df.to_csv(<your_path> + '1.csv')
df_1 = pd.read_csv(<your_path> + '1.csv')
if i > 1:
df_1 = pd.concat([df_1, df])
else:
df_1 = pd.read_excel(file)
# Do selection, preprocessing what you what here
df_1.to_csv(<your_path> + '.csv', index= False)

How to create a DataFrame from a list that each column is created by a regex expression

I have a list as such:
lst = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
'2021_01_21__12_27_44_1__13934_snapshot.jpg',
'2021_01_21__11_11_08_2__13928_snapshot.jpg',
'2021_01_21__12_27_56_2__13934_snapshot.jpg',
'2021_01_21__11_11_19_3__13928_snapshot.jpg',
'2021_01_21__12_28_08_3__13934_snapshot.jpg']
I want to create a DataFrame so that each column will be represented by:
def by_number(path):
base_name = os.path.basename(path)
return re.findall('[\_]{2}(\d{5})',lst)
And the rows will be represented by:
def by_index(path):
base_name = os.path.basename(path)
return re.findall('\_(\d)[\_]{2}',lst)
So eventually I'll get a DataFrame that looks something like this:
name_list = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
'2021_01_21__12_27_44_1__13934_snapshot.jpg',
'2021_01_21__11_11_08_2__13928_snapshot.jpg',
'2021_01_21__12_27_56_2__13934_snapshot.jpg',
'2021_01_21__11_11_19_3__13928_snapshot.jpg',
'2021_01_21__12_28_08_3__13934_snapshot.jpg']
import re
import pandas as pd
df = pd.DataFrame([[0]], columns=['count']) # initialize dataframe
for name in name_list:
count = re.search('\_(\d)[\_]{2}',name).group(1)
col = re.search('[\_]{2}(\d{5})',name).group(1)
if ((df['count'] == count)).any():
df.loc[df['count'] == count, col] = name
else:
new_row = pd.DataFrame([[count,name]], columns=['count',col])
df = df.append(new_row)
df.set_index('count', inplace=True)
print(df)

How to apply a function fastly on the list of DataFrame in Python?

I have a list of DataFrames with equal length of columns and rows but different values, such as
data = [df1, df2,df3.... dfn] .
How can I apply a function function on each dataframe in the list data? I used following code but it doe not work
data = [df1, def2,df3.... dfn]
def maxloc(data):
data['loc_max'] = np.zeros(len(data))
for i in range(1,len(data)-1): #from the second value on
if data['q_value'][i] >= data['q_value'][i-1] and data['q_value'][i] >= data['q_value'][i+1]:
data['loc_max'][i] = 1
return data
df_list = [df.pipe(maxloc) for df in data]
Seems to me the problem is in your maxloc() function as this code works.
I added also the maximum value in the return of maxloc.
from random import randrange
import pandas as pd
def maxloc(data_frame):
max_index = data_frame['Value'].idxmax(0)
maximum = data_frame['Value'][max_index]
return max_index, maximum
# create test list of data-frames
data = []
for i in range(5):
temp = []
for j in range(10):
temp.append(randrange(100))
df = pd.DataFrame({'Value': temp}, index=(range(10)))
data.append(df)
df_list = [df.pipe(maxloc) for df in data]
for i, (index, value) in enumerate(df_list):
print(f"Data-frame {i:02d}: maximum = {value} at position {index}")

Import and parse .data file

there is a file I tried to import and safe as pandas df. At a first sight looks like it's already columns and rows ordered, but finally I had to do a bunch of stuff to create pandas df. Could you please check if there is much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd
r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)): # remove empty strings
if bool(step_1[n]) == False:
del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)): # remove empty strings in each sublist
step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)
When you replaced \t to blankspace, you can use read_csv to read it. But you need to wrap up your text, because the first parameter in read_csv is filepath_or_buffer which needs object with a read() method (such as a file handle or StringIO). Then your question can be transform to read_csv doesn't read the column names correctly on this file?
import requests
import pandas as pd
from io import StringIO
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t"," ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin','car name']
df = pd.read_csv(StringIO(file),sep="\s+",header = None,names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df)

Resources