DataFrame append generates TypeError - python-3.x

I am trying to write a function to write and read transaction details to/from a .h5 file. I want to use one file to store the transaction details and read them back when necessary. Here's my code:
import h5py
import numpy as np
import pandas as pd
from datetime import datetime
from os import listdir
from pandas import HDFStore
def maintainLedger(mode, tick, lastBuyy = 0, lastSell = 0, quan = 0, prof = 0):
    """THIS FUNCTION WRITES AND READS TRANSACTION DETAILS.
    mode = 0 - IF FILE EXISTS, READ FILE
    mode = 1 - IF FILE EXISTS, APPEND TO FILE"""
    # CHECK IF LEDGER FILE EXISTS, IF NOT CREATE A LEDGER FILE FOR THE FIRST TIME
    path = r'ledger'
    suff = r'h5'
    flie = listdir(path)
    flie = [item for item in flie if item.endswith(suff)]
    if len(flie) == 0:
        HDF5Data = HDFStore('ledger/ledger.h5')
        # GENERATE NEW VALUES OF DATE/TIME
        mi = int(datetime.now().minute)
        ho = int(datetime.now().hour)
        da = int(datetime.now().day)
        we = int(datetime.now().isocalendar()[1])
        mo = int(datetime.now().month)
        ye = int(datetime.now().year)
        newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
        newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
        HDF5Data.put('data', newwData, format = 'table', data_columns = True)
        HDF5Data.close()
    elif len(flie) == 1:
        if mode == 0:
            # READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME
            readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
            # DO SOMETHING...
        elif mode == 1:
            # GENERATE NEW VALUES OF DATE/TIME
            mi = int(datetime.now().minute)
            ho = int(datetime.now().hour)
            da = int(datetime.now().day)
            we = int(datetime.now().isocalendar()[1])
            mo = int(datetime.now().month)
            ye = int(datetime.now().year)
            # GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
            newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
            newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
            # READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME AND APPEND NEW DATA
            readData = pd.read_hdf('ledger/ledger.h5', mode = 'a')
            readData.append('data', newwData)
            tempData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
            print(tempData)
    else:
        print('Please check input data for errors!')

if __name__ == '__main__':
    maintainLedger(1, "AAPL")
When I run the code, I am getting the following error:
TypeError: cannot concatenate object of type "<class 'str'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid
I have tried looking for a solution, and a quick search led me to this, which didn't solve my problem. Is there something I am doing wrong? Any advice would be appreciated.
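The error comes from the line readData.append('data', newwData). pd.read_hdf returns a DataFrame, and DataFrame.append expects a DataFrame or Series as its first argument; the key string 'data' belongs to HDFStore.append, so pandas tries to concatenate the string 'data' and raises the TypeError above. Because the store was created with format = 'table', one option is to append straight to the file without reading it back first; a minimal sketch:

from pandas import HDFStore

store = HDFStore('ledger/ledger.h5')
store.append('data', newwData)  # appends rows to the existing 'data' table on disk
store.close()

Alternatively, read the existing data, append to it in memory, and rewrite the store, as in the corrected version below: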

import h5py
import numpy as np
import pandas as pd
from datetime import datetime
from os import listdir
from pandas import HDFStore

def maintainLedger(mode, tick = 'QUERY', lastBuyy = 0, lastSell = 0, quan = 0, prof = 0):
    """THIS FUNCTION WRITES AND READS TRANSACTION DETAILS.
    mode = 0 - IF FILE EXISTS, READ FILE
    mode = 1 - IF FILE EXISTS, APPEND TO FILE"""
    # CHECK IF LEDGER FILE EXISTS, IF NOT CREATE A LEDGER FILE FOR THE FIRST TIME
    path = r'ledger'
    suff = r'h5'
    flie = listdir(path)
    flie = [item for item in flie if item.endswith(suff)]
    if len(flie) == 0:
        # GENERATE NEW VALUES OF DATE/TIME
        mi = int(datetime.now().minute)
        ho = int(datetime.now().hour)
        da = int(datetime.now().day)
        we = int(datetime.now().isocalendar()[1])
        mo = int(datetime.now().month)
        ye = int(datetime.now().year)
        # GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
        newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
        newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
        # SAVE ALL DATA INTO .H5 FORMAT
        HDF5Data = HDFStore('ledger/ledger.h5')
        HDF5Data.put('data', newwData, format = 'table', data_columns = True)
        HDF5Data.close()
    elif len(flie) == 1:
        if mode == 0:
            # THIS OPTION ENABLES CODE TO READ DATA.
            # READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME
            readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
            # DO SOMETHING...
            print(readData)
        elif mode == 1:
            # THIS OPTION ENABLES CODE TO APPEND DATA.
            # GENERATE NEW VALUES OF DATE/TIME
            mi = int(datetime.now().minute)
            ho = int(datetime.now().hour)
            da = int(datetime.now().day)
            we = int(datetime.now().isocalendar()[1])
            mo = int(datetime.now().month)
            ye = int(datetime.now().year)
            # GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
            newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
            newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
            # READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME AND APPEND NEW DATA
            readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
            readData = readData.append(newwData)
            # SAVE ALL DATA INTO .H5 FORMAT
            HDF5Data = HDFStore('ledger/ledger.h5')
            HDF5Data.put('data', readData, format = 'table', data_columns = True)
            HDF5Data.close()
    else:
        print('Please check input data for errors!')

if __name__ == '__main__':
    maintainLedger(1, 'MSFT')
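Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; on current versions, the same in-memory append is written with pd.concat:

readData = pd.concat([readData, newwData], ignore_index=True)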

Related

Perform code on multiple files 1 by 1 pandas

Hi, I have written code to read a .csv file in a folder and add some required columns.
I now want to run this code on multiple files within the path folder, one by one, and save each as a separate df.
My current code is as follows:
import pandas as pd
import numpy as np
import glob
import os

path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs Cowboys\Test'  # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
    frame = pd.read_csv(filename, index_col=None, skiprows=8)
    frame['filename'] = os.path.basename(filename)
    li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)
# Add odometer change and turn all accel values to positive
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()
# Add column with OdChange where velocity >= 5.5 m/s
frame["new1"] = np.where(frame.Velocity >= 5.5, frame["OdChange"], 0)
# Add column with accels/decels >= 2.5 m/s/s for AccelDec/min
frame["new2"] = np.where(frame.accelpos >= 2.5, frame["accelpos"], 0)
# Flag accels >= 2.5 m/s/s for the peak count
frame["new3"] = np.where(frame.Acceleration >= 2.5, 1, 0)
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)
# m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600, axis=0).sum()
# HS m/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600, axis=0).sum()
# AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600, axis=0).mean() * 60
# AccDec peak count
frame['1minAccCount'] = frame['new4'].rolling(window=600, axis=0).sum()
print(frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!
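Since each file should end up as a separate df rather than one concatenated frame, one approach is to keep a dict of DataFrames keyed by file name. A minimal sketch reusing the read logic above (add_columns is a hypothetical helper standing in for the column-adding code):

import glob
import os
import pandas as pd

def add_columns(frame):
    # Same column-adding logic as above, applied to one file's frame
    frame['OdChange'] = frame['Odometer'].diff()
    frame['accelpos'] = frame['Acceleration'].abs()
    return frame

dfs = {}  # one DataFrame per file, keyed by file name
for filename in glob.glob(os.path.join(path, '*.csv')):  # path as defined above
    frame = pd.read_csv(filename, index_col=None, skiprows=8)
    frame['filename'] = os.path.basename(filename)
    dfs[os.path.basename(filename)] = add_columns(frame)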

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt

files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
    df = pd.read_csv(f)
    time = df.iloc[:, 0]
    time = time.to_numpy()
    ynew = df.iloc[:, 1:]
    ynew = ynew.to_numpy()
    lowPassCutoffFreq = 6.0  # Cut-off frequency
    Sample_freq = 150  # Target sample frequency
    N = 2  # Order of the filter; in this case 2nd order
    Wn = lowPassCutoffFreq / (Sample_freq / 2)  # Normalize frequency
    b, a = signal.butter(5, Wn, btype='low', analog=False, output='ba')
    # scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
    output = signal.filtfilt(b, a, ynew, axis=0)
    np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter=', ', newline="\n")
I am trying to read in all the files in a directory; they are then low-pass filtered, and the results are saved one after the other, but not in one file. The result gives each file 3 columns, and ideally I would like them to be named with headers, e.g. col1, col2, col3.
Without using glob, I can filter all my files individually, but I have more than 100 such files.
Any help would be appreciated.
Best wishes,
I have partially solved the issue, apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt

path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
# ynew = []
yn = np.zeros(shape=(101, 1))
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    print(filename)
    foo = filename.split("/")[-1]
    # df = pd.read_csv(f)
    time = df.iloc[:, 0]
    time = time.to_numpy()
    ynew = df.iloc[:, 1:]
    ynew = ynew.to_numpy()
    # print(ynew)
    lowPassCutoffFreq = 6.0  # Cut-off frequency
    Sample_freq = 150  # Target sample frequency
    N = 2  # Order of the filter; in this case 2nd order
    Wn = lowPassCutoffFreq / (Sample_freq / 2)  # Normalize frequency
    b, a = signal.butter(5, Wn, btype='low', analog=False, output='ba')
    # scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
    output = signal.filtfilt(b, a, ynew, axis=0)
    # print(output)
    tn = np.linspace(0, 100, 101)  # new time vector for the new time-normalized data
    yn, tn, indie = tnorma(output, k=3, smooth=1, mask=None, show=False)
    np.savetxt("path_name/foldername/file" + foo, yn, delimiter=', ', newline="\n")
However, I am having difficulty putting header names on the 3 columns of each file.
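np.savetxt can write the header row itself via its header argument (pass comments='' so the line is not prefixed with '#'), or you can wrap the array in a DataFrame. A sketch of both, assuming the same yn and foo as above:

# Option 1: plain numpy
np.savetxt("path_name/foldername/file" + foo, yn, delimiter=', ', newline="\n",
           header='col1, col2, col3', comments='')
# Option 2: pandas
pd.DataFrame(yn, columns=['col1', 'col2', 'col3']).to_csv("path_name/foldername/file" + foo, index=False)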

How can I speed these API queries up?

I am feeding a long list of inputs to a function that calls an API to retrieve data. My list has around 40,000 unique inputs. Currently, the function returns output every 1-2 seconds or so. Quick maths tells me it would take over 10 hours before the function is done. I therefore want to speed this process up, but I have struggled to find a solution. I am quite a beginner, so threading/pooling is quite difficult for me. I hope someone is able to help me out here.
The function:
import quandl
import datetime
import numpy as np

quandl.ApiConfig.api_key = 'API key here'

def get_data(issue_date, stock_ticker):
    # Prepare var
    stock_ticker = "EOD/" + stock_ticker
    # Volatility
    date_1 = datetime.datetime.strptime(issue_date, "%d/%m/%Y")
    pricing_date = date_1 + datetime.timedelta(days=-40)  # -40 days of issue date
    volatility_date = date_1 + datetime.timedelta(days=-240)  # -240 days of issue date (-40,-240 range)
    # Check if code exists: if not -> return empty array
    try:
        stock = quandl.get(stock_ticker, start_date=volatility_date, end_date=pricing_date)  # get pricing data
    except quandl.errors.quandl_error.NotFoundError:
        return []
    daily_close = stock['Adj_Close'].pct_change()  # returns using adj. close
    stock_vola = np.std(daily_close) * np.sqrt(252)  # annualized volatility
    # Average price
    stock_pricing_date = date_1 + datetime.timedelta(days=-2)  # -2 days of issue date
    stock_pricing_date2 = date_1 + datetime.timedelta(days=-12)  # -12 days of issue date
    stock_price = quandl.get(stock_ticker, start_date=stock_pricing_date2, end_date=stock_pricing_date)
    stock_price_average = np.mean(stock_price['Adj_Close'])  # get average price
    # Amihud's liquidity measure
    liquidity_pricing_date = date_1 + datetime.timedelta(days=-20)
    liquidity_pricing_date2 = date_1 + datetime.timedelta(days=-120)
    stock_data = quandl.get(stock_ticker, start_date=liquidity_pricing_date2, end_date=liquidity_pricing_date)
    p = np.array(stock_data['Adj_Close'])
    returns = np.array(stock_data['Adj_Close'].pct_change())
    dollar_volume = np.array(stock_data['Adj_Volume'] * p)
    illiq = np.divide(returns, dollar_volume)
    print(np.nanmean(illiq))
    illiquidity_measure = np.nanmean(illiq, dtype=float) * (10 ** 6)  # multiply by 10^6 for expositional purposes
    return [stock_vola, stock_price_average, illiquidity_measure]
I then use a separate script to select my csv file containing the rows, each row holding an issue_date and stock_ticker:
import function
import csv
import tkinter as tk
from tkinter import filedialog

# Open file dialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename()
# Load spreadsheet data
f = open(file_path)
csv_f = csv.reader(f)
next(csv_f)
result_data = []
# Iterate
for row in csv_f:
    try:
        return_data = function.get_data(row[1], row[0])
        if len(return_data) != 0:
            # print(return_data)
            result_data_loc = [row[1], row[0]]
            result_data_loc.extend(return_data)
            result_data.append(result_data_loc)
    except AttributeError:
        print(row[0])
        print('\n\n')
        print(row[1])
        continue
if result_data:
    with open('results.csv', mode='w', newline='') as result_file:
        csv_writer = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for result in result_data:
            # print(result)
            csv_writer.writerow(result)
else:
    print("No results found!")
It is quite messy, but like I mentioned before, I am definitely a beginner. Speeding this up would greatly help me.
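Since the bottleneck is waiting on the network rather than CPU work, a thread pool is a good first step. A minimal sketch using concurrent.futures with the get_data function above (max_workers=10 is an assumption; check Quandl's rate limits before raising it):

import csv
from concurrent.futures import ThreadPoolExecutor

import function  # the module defining get_data above

def worker(row):
    # Fetch data for one (stock_ticker, issue_date) row; return None on failure
    try:
        return_data = function.get_data(row[1], row[0])
    except AttributeError:
        return None
    return [row[1], row[0], *return_data] if return_data else None

with open(file_path) as f:  # file_path selected via the dialog above
    reader = csv.reader(f)
    next(reader)  # skip the header row
    rows = list(reader)

# Run up to 10 requests concurrently instead of one at a time
with ThreadPoolExecutor(max_workers=10) as pool:
    result_data = [r for r in pool.map(worker, rows) if r is not None]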

Daily data of same month over years

I have data from the same month over a period of time, and I am trying to plot the mean by day of the month, but I don't know how to do it.
The main code to get the dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from datetime import date, timedelta
from datetime import datetime

inicio = date(1973, 1, 1)
# inicio = date(2019, 2, 15)
# final = date(2000, 10, 10)
final = date(1974, 3, 1)
delta = timedelta(days=1)
años = []
links = []
while inicio <= final:
    fechas = inicio.strftime("%Y-%m-%d")
    # años.append(datetime.strptime(fechas, '%Y-%m-%d').date())
    años.append(fechas)
    url = 'http://weather.uwyo.edu/cgi-bin/sounding?region=samer&TYPE=TEXT%3ALIST&YEAR={}&MONTH={}&FROM={}12&TO={}12&STNM=80222'.format(fechas[0:4], fechas[5:7], fechas[8:10], fechas[8:10])
    links.append(url)
    inicio += delta
d = dict(zip(años, links))
df1 = pd.DataFrame(list(d.items()), columns=['Fecha', 'url'])
df1.set_index('Fecha', inplace=True)
Enero = pd.DataFrame()
Febrero = pd.DataFrame()
for i in df1.index:
    if i[5:7] == '01':
        Enero = Enero.append(df1.loc[i], ignore_index=False)
    elif i[5:7] == '02':
        Febrero = Febrero.append(df1.loc[i], ignore_index=False)
labels = ['PRES', 'HGHT', 'TEMP', 'DWPT', 'RELH', 'MIXR', 'DRCT', 'SKNT', 'THTA', 'THTE', 'THTV']

def reques(url):
    try:
        results = []
        peticion = requests.get(url)
        soup = bs(peticion.content, 'lxml')
        pre = (soup.select_one('pre')).text
        for line in pre.split('\n')[4:-1]:
            # print(line)
            if '--' not in line:
                row = [line[i:i+7].strip() for i in range(0, len(line), 7)]
                results.append(row)
            else:
                pass
        df5 = pd.DataFrame.from_records(results, columns=labels)
        # return x
        return df5
    except AttributeError:
        pass

SuperDF = pd.DataFrame(columns=labels)
startTime = datetime.now()
sin_datos = []
for i in Febrero['url']:
    try:
        x = reques(i)
        df2 = x
        y = str(df1[df1['url'] == i].index.values)
        df2.index = [y] * len(x)
        SuperDF = SuperDF.append(x)
    except TypeError:
        sin_datos.append(df1[df1['url'] == i].index.values)
        print(df1[df1['url'] == i].index.values)
SuperDF.index = SuperDF.index.map(lambda x: x.lstrip("['").rstrip("]''"))
SuperDF.index = pd.to_datetime(SuperDF.index)
SuperDF = SuperDF.apply(pd.to_numeric)
SuperDF
I've been trying to do it with this:
import seaborn as sns

SuperDF = SuperDF[(SuperDF['TEMP'] == 0)]
ax = SuperDF.loc['02', 'RELH'].plot(marker='o', linestyle='-')
ax.set_ylabel('RELH')
but I got this error:
KeyError: '02'
It works when I pass the year, but I need the mean by day for the month. Any help will be appreciated.
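The KeyError comes from .loc['02']: partial string indexing on a DatetimeIndex works for a year like '1973', but '02' on its own is not interpreted as a month, so pandas treats it as a missing label. To get the mean per day of the month across all the Februaries in the index, group on the index's day attribute; a minimal sketch:

# Mean RELH for each day of the month, averaged across the years
daily_mean = SuperDF.groupby(SuperDF.index.day)['RELH'].mean()
ax = daily_mean.plot(marker='o', linestyle='-')
ax.set_xlabel('Day of month')
ax.set_ylabel('RELH')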

Import and parse .data file

There is a file I tried to import and save as a pandas df. At first sight it looks like the columns and rows are already ordered, but in the end I had to do a bunch of stuff to create the pandas df. Could you please check if there is a much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd

r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)):  # remove empty strings
    if bool(step_1[n]) == False:
        del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)):  # remove empty strings in each sublist
    step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)
Once you replace \t with a blank space, you can use read_csv to read it. But you need to wrap the text, because the first parameter of read_csv is filepath_or_buffer, which needs an object with a read() method (such as a file handle or StringIO). Your question then reduces to getting read_csv to read the column names correctly from this file:
import requests
import pandas as pd
from io import StringIO

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t", " ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
df = pd.read_csv(StringIO(file), sep="\s+", header=None, names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
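As a further simplification, read_csv accepts a URL directly, and sep="\s+" is treated the same as delim_whitespace=True (quoted car names still survive the whitespace splitting), so the requests/StringIO step can be dropped entirely. A sketch; na_values='?' is added because this file marks missing horsepower values with '?':

import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
df = pd.read_csv(url, sep="\s+", header=None, names=list_labels, na_values='?')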
