Reading in multiple files in Python and saving them one by one in a different directory - python-3.x

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,

I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

Related

Perform code on multiple files 1 by 1 pandas

Hi I have code I have written to read a .csv file in a folder and add some required columns.
I now want to perform this code on multiple files within the path folder 1 by 1 and save each as a separate df.
My current code is as follows
import pandas as pd
import glob
import os
path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs
Cowboys\Test' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
frame = pd.read_csv(filename, index_col=None, skiprows=8)
li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)
frame['filename'] = os.path.basename
#Add odometer change and turn all accel values to positive
import numpy as np
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()
#Add column with OdChange # >5.5m/s
frame["new1"] = np.where(
(frame.Velocity >=5.5),
frame["OdChange"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new2"] = np.where(
(frame.accelpos >=2.5),
frame["accelpos"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new3"] = np.where(
(frame.Acceleration >=2.5),
'1',
'0')
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)
frame['new4']
#m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600, axis=0).sum()
#HSm/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600, axis=0).sum()
#AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600, axis=0).mean() *60
#AccDec Peak Count
frame['1minAccCount'] = frame['new4'].rolling(window=600, axis=0).sum()
print (frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!

Trying to plot a rolling corr line chart but Matplot keeps saying to bring in only valid columns?

Im trying to create a rolling corr using matplot but I get the error "select only valid columns before calling the operation. Dropped columns were Index(['time'], dtype='object')
I have dropped that field from my data frame but the error keeps on appearing ?
Is it something to do with my .iloc argument?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import seaborn as sns
import scipy.stats as stats
import json
from datetime import timezone
from datetime import datetime
from pycoingecko import CoinGeckoAPI
pd.options.display.width = 0
def datetime_to_unix(year, month, day):
'''datetime_to_unix(2021, 6, 1) => 1622505600.0'''
dt = datetime(year, month, day)
timestamp = (dt - datetime(1970, 1, 1)).total_seconds()
return timestamp
def unix_to_datetime(unix_time):
'''unix_to_datetime(1622505700)=> ''2021-06-01 12:01am'''''
ts = int(unix_time/1000 if len(str(unix_time)) > 10 else unix_time) # /1000 handles milliseconds
return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %l:%M%p').lower()
# Initialize the client
cg = CoinGeckoAPI()
# Retrieve looksrare data in USD
result = cg.get_coin_market_chart_range_by_id(
id='looksrare',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df = pd.DataFrame({'time':time, 'price':price,})
df.head(100)
# Retrieve ETH data in USD
result = cg.get_coin_market_chart_range_by_id(
id='ethereum',
vs_currency='usd',
from_timestamp=datetime_to_unix(2022, 1, 11),
to_timestamp=datetime_to_unix(2022, 4, 20)
)
time = [ unix_to_datetime(i[0]) for i in result['prices'] ]
p_array = np.array(result['prices'])
price = p_array[:,1]
v_array = np.array(result['total_volumes'])
volume = v_array[:,1]
df2 = pd.DataFrame({'time':time, 'price':price,})
df2.head(100)
df_cd = pd.merge(df, df2, how='inner', on='time')
df_cd = df_cd.drop('time', 1)
output = df_cd.corr()
output1 = df_cd['price_x'].corr(df_cd['price_y'])
overall_pearson_r = df_cd.corr().iloc[0,1]
print(df_cd)
print(f"Pandas computed Pearson r: {overall_pearson_r}")
r, p = stats.pearsonr(df_cd.dropna()['price_x'], df_cd.dropna()['price_y'])
print(f"Scipy computed Pearson r: {r} and p-value: {p}")
# compute rolling window synchrony
f,ax=plt.subplots(figsize=(7,3))
df.rolling(window=30,center=True).median().plot(ax=ax)
ax.set(xlabel='Time',ylabel='Pearson r')
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");

Apply function on a Pandas Dataframe

Apply function on a Pandas Dataframe
I have a code (C01) that calculates the moving averages (21 periods) of a given stock (individual) on the stock exchange (IBOV - B3-BRAZIL). Then I created a for loop where it determines that an asset is in an upward trend after 6 highs followed by moving averages (hypothesis, considering that there are more variables to determine this).
However, I want to do this loop for more than one asset, in this case C02, that is, it applies a function in each column of my code and returns only the name of the assets that are in an upward trend (in this case, the column name). I tried to turn the for loop into a function and apply that function using the pandas 'apply' to each column (axis = 1, I tried tbm axis = 'columns'). But I'm having an error creating the function. When I execute the function using apply, the message "ValueError: Lengths must match to compare" appears. How can I fix this?
Grateful for the attention.
import numpy as np
import pandas as pd
from pandas_datareader import data as wb
from mpl_finance import candlestick_ohlc
from pandas_datareader import data as wb
from datetime import datetime
import matplotlib.dates as mpl_dates
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#STOCK
ativo = 'WEGE3.SA'
acao2 = ativo.upper()
#START AND END ANALYSIS
inicio = '2020-1-1'
fim = '2021-1-27'
#MAKE DATAFRAME
df00 = wb.DataReader(acao2, data_source='yahoo', start=inicio, end=fim)
df00.index.names = ['Data']
df= df00.copy(deep=True)
df['Data'] = df.index.map(mdates.date2num)
# MOVING AVERAGE
df['ema21'] = df['Close'].ewm(span=21, adjust=False).mean()
df['ema72'] = df['Close'].ewm(span=72, adjust=False).mean()
#DF PLOT
df1=df
df2=df[-120:]
#TREND RULE
alta=1
for i in range(6):
if(df2.ema21[-i-1] < df2.ema21[-i-2]):
alta=0
baixa=1
for i in range(6):
if(df2.ema21[-i-1] > df2.ema21[-i-2]):
baixa=0
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#PLOT RESULTS
print("---------------------------------------")
print(a1)
print("---------------------------------------")
ohlc = df[['Data', 'Open', 'High', 'Low', 'Close']]
f1, ax = plt.subplots(figsize=(14, 8))
# plot the candlesticks
candlestick_ohlc(ax, ohlc.values, width=.6, colorup='green', colordown='red')
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
label_ = acao2.upper() + ' EMA26'
label_2 = acao2.upper() + ' EMA09'
ax.plot(df.index, df1['ema21'], color='black', label=label_)
ax.plot(df.index, df1['ema72'], color='blue', label=label_)
ax.grid(False)
ax.legend()
ax.grid(True)
plt.title(acao2.upper() + ' : Gráfico Diário')
plt.show(block=True)
#C02
#START/END ANALISYS
inicio = '2020-1-1'
fim = '2021-1-27'
#STOCKS
ativos = ['SAPR11.SA','WEGE3.SA']
#DATAFRAME
mydata = pd.DataFrame()
for t in ativos:
mydata[t] = wb.DataReader(t, data_source='yahoo', start=inicio, end=fim)['Close']
df2 = mydata
#MOVING AVERAGE
df3 = df2.apply(lambda x: x.rolling(window=21).mean())
#MAKE FUNCTION
def trend(x):
tendencia_alta=1
for i in range(6):
if(df3.columns[-i-1:] > df3.columns[-i-2:]):
tendencia_alta=0
print()
if (alta==1 and baixa==0):
a1 = ativo.upper()+ ' HIGH TREND'
elif (alta==0 and baixa==1):
a1 = ativo.upper()+ ' LOW TREND!'
else:
a1 = ativo.upper()+ ' UNDEFINED'
#TRYING TO APPLY THE FUNCTION IN EVERY DF3 COLUMN
df3.apply(trend, axis=1)´´´
something like:
def myfunc(x):
#do things here where x is the group of rows sent to function
#instead of df['column'], you'll use x['column']
#because you are passing the rows into x
return x
df.groupby('yourcolumn').apply(myfunc)

Extract Pixels from a pmg file and convert them into a pandas data frame

I have a directory that has subdirectories each with a bunch of PMG files, I would like to extract the pixels from each image and put them in a pandas data frame.
from PIL import Image
import os
import pandas as pd
import numpy as np
dirs = [r"D:\MSIT\Machine Learning\IMG"+"\\s"+str(i) for i in range(1,41)]
pixels = list()
df = pd.DataFrame(columns = ["f" + str(i) for i in range(1,10305)])
cols = list(df.columns)
for directory in dirs:
for filename in os.listdir(directory):
im = Image.open(directory + "\\" +filename)
dims = (list(im.getdata()))
df2 = pd.Series(dims)
pixels.append(dims)
k = 1
for i in pixels:
for j in i:
df2 = pd.Series(j)
df.append(df2, ignore_index = True)
print(str(k) + "Done")
k += 1
print(df.head())
df.to_csv('pixel_data.csv')
I'm assuming you want the pixel values of the PMG files to be your features. You can use df.loc to use indexing in a DataFrame and to add your data in a row after row fashion. Also, using numpy would make the process a little bit faster.
import pandas as pd
from PIL import Image
import os
import numpy as np
columns = [i for i in range(10304)]
columns.append('Label')
df = pd.DataFrame(columns=columns)
rows = 0
for direc in os.listdir():
if direc.startswith('s'):
print('Adding ' + direc)
print('--------------')
for file in os.listdir('./' + direc):
im = Image.open('./' + direc + '/' + file)
x = np.array(im.getdata())
x = x.tolist()
x.append(int(direc.replace('s', '')))
df.loc[rows] = x
rows += 1
df.to_csv('Dataset.csv')

How to read from CSV file

I am trying to understand how Kalman Filter for non-linear system works. While searching for an example, I cam across this good basic example.
import numpy as np
import pylab as pl
import pandas as pd
from pykalman import UnscentedKalmanFilter
# initialize parameters
def transition_function(state, noise):
a = np.sin(state[0]) + state[1] * noise[0]
b = state[1] + noise[1]
return np.array([a, b])
def observation_function(state, noise):
C = np.array([[-1, 0.5], [0.2, 0.1]])
return np.dot(C, state) + noise
transition_covariance = np.eye(2)
random_state = np.random.RandomState(0)
observation_covariance = np.eye(2) + random_state.randn(2, 2) * 0.1
initial_state_mean = [0, 0]
initial_state_covariance = [[1, 0.1], [-0.1, 1]]
# sample from model
kf = UnscentedKalmanFilter(
transition_function, observation_function,
transition_covariance, observation_covariance,
initial_state_mean, initial_state_covariance,
random_state=random_state
)
states, observations = kf.sample(50, initial_state_mean)
# estimate state with filtering and smoothing
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]
# draw estimates
pl.figure()
lines_true = pl.plot(states, color='b')
lines_filt = pl.plot(filtered_state_estimates, color='r', ls='-')
lines_smooth = pl.plot(smoothed_state_estimates, color='g', ls='-.')
pl.legend((lines_true[0], lines_filt[0], lines_smooth[0]),
('true', 'filt', 'smooth'),
loc='lower left'
)
pl.show()
This code produces the following graph.
However,for my experiment - I have created a very small time series data ready with three columns formatted as follows. The full dataset is attached here for reproduciability.
time X Y
0.040662 1.041667 1
0.139757 1.760417 2
0.144357 1.190104 1
0.145341 1.047526 1
0.145401 1.011882 1
0.148465 1.002970 1
.... ..... .
Instead of using the random values as shown in the code, how can we input from the CSV file I attached? Here is my approach, but it doesn't seem to workout for me and I would appreciate for any help.
df = pd.read_csv('testdata.csv')
pd.set_option('use_inf_as_null', True)
df.dropna(inplace=True)
X = df.drop('Y', axis=1)
y = df['Y']
d1= np.array(X)
d2 = np.array(y)
From the link I shared, here is how you get the CSV data into Numpy Arrays.
import numpy as np
import csv
with open('testdata.csv','r') as csvfile:
r = csv.reader(csvfile, delimiter=',')
data = [i for i in r]
headings = data.pop(0)
data = np.array([[np.float(j) for j in i] for i in data])
T = data.T[0] #Time
X = data.T[1] #X
Y = data.T[2] #Y
print(T)
print(X)
print(Y)

Resources