How can I make my code call each file in the correct sequence?

I have a folder with 38 files. The names are like this:
AWA_s1_features.mat, AWA_s2_features.mat, ..., AWA_s38_features.mat
Each file is an array with 28 columns but a different number of rows. For example: AWA_s1_features.mat is (139, 28), AWA_s2_features.mat is (199, 28), and so on.
As I am doing machine learning, I need to join all these files into one large array and label each row by its source file. So the 139 rows from AWA_s1_features.mat must be labeled 1, the 199 rows from AWA_s2_features.mat must be labeled 2, and so on up to AWA_s38_features.mat, whose rows must be labeled 38.
I wrote some code, but I have found that the files are not called in order and therefore the labeling is wrong. For example, AWA_s1_features.mat is not the first file to be called, and its rows have been labeled 11; AWA_s2_features.mat's rows have been labeled 21.
So how can I improve my code so that it calls each file in the correct sequence?
Here is the code:
import numpy as np
import scipy.io as sio
import glob

read_files = glob.glob('I:/2D/Features 2D/AWA_s*.mat')
x = np.array([])
y = np.array([])
q = 1
for f in read_files:
    l = sio.loadmat(f)['features']
    x = np.concatenate((x, l), axis=0) if x.size else l
    y_temp = q*np.ones((l.shape[0], 1))
    y = np.concatenate((y, y_temp), axis=0) if y.size else y_temp
    q = q + 1
sio.savemat('AWA_FeaturesAll.mat', {'x': x, 'y': y})

The problem is that glob makes no ordering guarantee, and the default sort order of these names is alphabetical, meaning that "s11" comes before "s2". You want numerical sorting, and one way is to use the sorted function with a key that extracts the file number, like so:
import numpy as np
import scipy.io as sio
import glob

read_files = glob.glob('I:/2D/Features 2D/AWA_s*.mat')
x = np.array([])
y = np.array([])
q = 1
for f in sorted(read_files, key=lambda f: int(f.split('_')[1][1:])):
    l = sio.loadmat(f)['features']
    x = np.concatenate((x, l), axis=0) if x.size else l
    y_temp = q*np.ones((l.shape[0], 1))
    y = np.concatenate((y, y_temp), axis=0) if y.size else y_temp
    q = q + 1
sio.savemat('AWA_FeaturesAll.mat', {'x': x, 'y': y})
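The split-based key works for these exact names, but it breaks if a folder name ever contains an underscore. As a sketch of a more robust alternative (assuming every file name contains an `_s<number>_` token), a regular expression can pull the number out directly:

import re
import glob

read_files = glob.glob('I:/2D/Features 2D/AWA_s*.mat')

def file_number(path):
    # extract the integer after "_s" in names like AWA_s12_features.mat
    return int(re.search(r'_s(\d+)_', path).group(1))

for f in sorted(read_files, key=file_number):
    print(f)  # AWA_s1_features.mat, AWA_s2_features.mat, ..., AWA_s38_features.mat

As a bonus, you could derive the label from the file name itself (q = file_number(f)) instead of a running counter, so the labels stay correct even if a file is missing.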

Related

How can I interpolate values from two lists (in Python)?

I am relatively new to coding in Python. I have mainly used MATLAB in the past and am used to vectors that can be referenced explicitly rather than appended-to lists. I have a script where I generate a list of x- and y- (z-, v-, etc.) values. Later, I want to interpolate and then print a table of the values at specified points. Here is an MWE. The problem is at this line:
yq = interp1d(x_list, y_list, xq(nn))#interp1(output1(:,1),output1(:,2),xq(nn))
I'm not sure I have the correct syntax for the last two lines either:
table[nn] = ('%.2f' %xq, '%.2f' %yq)
print(table)
Here is the full script for the MWE:
# This script was written to test how to interpolate after data was created
# in a loop and stored as a list. Can a list be accessed explicitly like a
# vector in MATLAB?
from scipy.interpolate import interp1d
from math import *  # for ceil
from astropy.table import Table  # for Table
import numpy as np

# define the initial conditions
x = 0  # initial x position
y = 0  # initial y position
Rmax = 10  # maximum range

""" initializing variables for plots"""
x_list = [x]
y_list = [y]

""" define functions"""
# not necessary for this MWE

"""create sample data for MWE"""
# x and y data are calculated using functions and appended to their respective lists
h = 1
t = 0
tf = 10
N = ceil(tf/h)
# Example of interpolation without a loop: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html#d-interpolation-interp1d
#x = np.linspace(0, 10, num=11, endpoint=True)
#y = np.cos(-x**2/9.0)
#f = interp1d(x, y)
for i in range(N):
    x = h*1
    y = cos(-x**2/9.0)
    """ appends selected data for ability to plot"""
    x_list.append(x)
    y_list.append(y)

## Interpolation after x- and y-lists are already created
intervals = 0.5
nfinal = ceil(Rmax/intervals)
NN = nfinal + 1  # length of table
dtype = [('Range (units?)', 'f8'), ('Drop? (units)', 'f8')]
table = Table(data=np.zeros(N, dtype=dtype))
for nn in range(NN):  # for nn = 1:NN
    xq = 0.0 + (nn-1)*intervals
    yq = interp1d(x_list, y_list, xq(nn))  # interp1(output1(:,1),output1(:,2),xq(nn))
    table[nn] = ('%.2f' % xq, '%.2f' % yq)
print(table)
Your help and patience will be greatly appreciated!
Best regards,
Alex
Your code has some glaring issues that made it really difficult to understand. Let's first take a look at some things I needed to fix:
for i in range(N):
    x = h*1
    y = cos(-x**2/9.0)
    """ appends selected data for ability to plot"""
    x_list.append(x)
    y_list.append(y)
You are appending a single value without ever modifying it: h*1 is constant, so x_list fills up with identical entries (you presumably meant h*i). What I presume you wanted is shown further down.
intervals = 0.5
nfinal = ceil(Rmax/intervals)
NN = nfinal + 1  # length of table
dtype = [('Range (units?)', 'f8'), ('Drop? (units)', 'f8')]
table = Table(data=np.zeros(N, dtype=dtype))
for nn in range(NN):  # for nn = 1:NN
    xq = 0.0 + (nn-1)*intervals
    yq = interp1d(x_list, y_list, xq(nn))  # interp1(output1(:,1),output1(:,2),xq(nn))
    table[nn] = ('%.2f' % xq, '%.2f' % yq)
This is where things get strange. First: consider pandas tables; they are the more popular choice. Second: I have no idea what you are trying to loop over. What I presume you wanted was to vary the number of points for the interpolation, which I have done below. Third: you are trying to interpolate a single point, when you probably want to interpolate over a range of points (hence the name interpolation). Lastly, you are using the interp1d function incorrectly: it returns an interpolating function, it does not take the query point as a third argument. Please take a look at the code below and let me know what exactly you wanted (specifically: what should xq / xq(nn) be?), because the MWE you provided is quite confusing.
from scipy.interpolate import interp1d
from math import *
import numpy as np

Rmax = 10
h = 1
t = 0
tf = 10
N = ceil(tf/h)

x = np.arange(0, N+1)
y = np.cos(-x**2/9.0)

interval = 0.5
NN = ceil(Rmax/interval) + 1
ip_list = np.arange(1, interval*NN, interval)

xtable = []
ytable = []
for i, nn in enumerate(ip_list):
    f = interp1d(x, y)
    x_i = np.arange(0, nn+interval, interval)
    xtable += [x_i]
    ytable += [f(x_i)]

[print(i) for i in xtable]
[print(i) for i in ytable]
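If the goal was simply the MATLAB pattern interp1(x, y, xq), build the data once, then evaluate the interpolant at a grid of query points and tabulate the result, a minimal sketch looks like this. The column names are carried over from the question; the query grid of 0.5-unit steps is an assumption:

from math import cos
import numpy as np
from scipy.interpolate import interp1d
from astropy.table import Table

# build the sample data once (equivalent to the MWE's loop, with h*i fixed)
x_list = [float(i) for i in range(11)]            # 0, 1, ..., 10
y_list = [cos(-xi**2 / 9.0) for xi in x_list]

f = interp1d(x_list, y_list)          # f is a callable, like a MATLAB interp1 handle
xq = np.arange(0.0, 10.5, 0.5)        # query points every 0.5 units (assumed range)
yq = f(xq)                            # evaluate the interpolant at all query points

table = Table([xq, yq], names=('Range (units?)', 'Drop? (units)'), dtype=('f8', 'f8'))
table['Range (units?)'].format = '%.2f'
table['Drop? (units)'].format = '%.2f'
print(table)

A pandas DataFrame({'Range': xq, 'Drop': yq}) would work equally well, per the note above.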

Improve the speed of for loop over a loaded file

I have a dataset in text file in the following form:
5851F42D00000000,1
4BB5F64640B18CCF,2
742D2F7A0AE16FD9,1
76035E090D1F0796,1
6FA72CA540F7702C,3
.
.
.
The file contains 500K rows. My goal is to read the file and convert the hex values to binary. The following code works fine but it is very slow. Is there a trick to make it faster?
import pandas as pd
import numpy as np

df = pd.read_csv(path + 'dataset.txt', sep=",", header=None)
X = []
y = []
for i, row in df.iterrows():
    n = int('{:064b}'.format(int(row.values[0], 16)))
    X.append(n)
    y.append(row.values[1])
X = np.asarray(X)
y = np.asarray(y)
There is no need for the explicit loop and the list appends.
Use pandas "magic":
df = pd.read_csv('test.csv', sep=",", header=None)
x = df[0].apply(lambda x: int('{:064b}'.format(int(x, 16)))).to_numpy()
y = df[1].to_numpy()
print(x, y)
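Note that apply still runs a Python-level function per row, so the speedup over iterrows comes mostly from avoiding the per-row object overhead, not from true vectorization. If the downstream code only needs the numeric value of each key, rather than its binary digits reinterpreted as a decimal integer, a sketch of a simpler variant (same assumed file layout) is:

import pandas as pd

df = pd.read_csv('test.csv', sep=",", header=None)

# int(value, 16) parses the hex string directly; keyword arguments
# given to Series.apply are forwarded to the function
x = df[0].apply(int, base=16).to_numpy()
y = df[1].to_numpy()
print(x, y)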

can't convert expression to float problem

I am trying to use the "subs" function for a differential equation,
but I get the error: "can't convert expression to float".
I tried to check the types in the arrays, but they are all floats.
import sympy as sym
from sympy.integrals import inverse_laplace_transform
from sympy.abc import s, t, y
import numpy as np

U = 1
G = (s+1)/(s*(s+2))
Y = G*U
y = inverse_laplace_transform(Y, s, t)

tm = np.linspace(0, 2, 3)
y_val = np.zeros(len(tm))
for i in range(len(tm)):
    y_val[i] = y.subs(t, tm[i])
print(y)
print(y_val)
line 17
y_val[i] = y.subs(t, tm[i])
TypeError: can't convert expression to float
The issue here is that, because tm[0] == 0, the evaluated y in the first iteration of your loop is Heaviside(0), which has no defined real value by default (see https://docs.sympy.org/latest/modules/functions/special.html#heaviside). This is because you have
from sympy.functions import exp, Heaviside
assert y == Heaviside(t) / 2 + exp(-2 * t) * Heaviside(t) / 2
The simplest workaround here is defining a linear space excluding 0, for instance
epsilon = 1e-15
tm = np.linspace(epsilon, 2, 3)
With y_val = np.zeros(len(tm)), the array's datatype defaults to float, but Heaviside(0.0) is a SymPy expression that cannot be converted to one. After modifying the code, you can see that the first element of y_val is such an object, not a float. You can use a plain list as the placeholder, or specify the datatype of the numpy array as object:
import sympy as sym
from sympy.integrals import inverse_laplace_transform
from sympy.abc import s, t, y
import numpy as np

U = 1
G = (s+1)/(s*(s+2))
Y = G*U
y = inverse_laplace_transform(Y, s, t)

tm = np.linspace(0, 2, 3)
# y_val = [0 for _ in range(len(tm))]
y_val = np.zeros(len(tm), dtype=object)
for i in range(len(tm)):
    y_val[i] = y.subs(t, tm[i])
print(y_val)
result: [Heaviside(0.0) 0.567667641618306 0.509157819444367]
I have a similar problem, and your answers work for me, but I still need to put the data into a graph. I modified my problem for this question:
import sympy as sym
from sympy.integrals import inverse_laplace_transform
from sympy.abc import s, t, y
import numpy as np
import matplotlib.pyplot as plt

Y = (5*(1 - 5*s))/(s*(4*(s**2) + s + 1))*(1/s)
y = inverse_laplace_transform(Y, s, t)

tm = np.linspace(1e-15, 20, 100)
y_val = np.zeros(len(tm), dtype=object)
for i in range(len(tm)):
    y_val[i] = y.subs(t, tm[i])
plt.plot(y_val, tm)
plt.show()
Running this code, I got the same error:
TypeError: can't convert expression to float
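A likely cause (not confirmed in the thread): matplotlib has to coerce the object-dtype array to floats, and it fails if any substituted expression has not fully reduced to a number. A sketch of a workaround is to force numeric evaluation with evalf/float, which also pinpoints any offending value, and to put tm on the x-axis:

import sympy as sym
from sympy.integrals import inverse_laplace_transform
from sympy.abc import s, t
import numpy as np
import matplotlib.pyplot as plt

Y = (5*(1 - 5*s))/(s*(4*(s**2) + s + 1))*(1/s)
y = inverse_laplace_transform(Y, s, t)

tm = np.linspace(1e-15, 20, 100)
# float(...) forces full numeric evaluation; it raises immediately at any
# point where the expression does not reduce to a real number
y_val = [float(y.subs(t, ti).evalf()) for ti in tm]

plt.plot(tm, y_val)  # time belongs on the x-axis
plt.show()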

How to read from CSV file

I am trying to understand how the Kalman filter for non-linear systems works. While searching for an example, I came across this good basic example.
import numpy as np
import pylab as pl
import pandas as pd
from pykalman import UnscentedKalmanFilter

# initialize parameters
def transition_function(state, noise):
    a = np.sin(state[0]) + state[1] * noise[0]
    b = state[1] + noise[1]
    return np.array([a, b])

def observation_function(state, noise):
    C = np.array([[-1, 0.5], [0.2, 0.1]])
    return np.dot(C, state) + noise

transition_covariance = np.eye(2)
random_state = np.random.RandomState(0)
observation_covariance = np.eye(2) + random_state.randn(2, 2) * 0.1
initial_state_mean = [0, 0]
initial_state_covariance = [[1, 0.1], [-0.1, 1]]

# sample from model
kf = UnscentedKalmanFilter(
    transition_function, observation_function,
    transition_covariance, observation_covariance,
    initial_state_mean, initial_state_covariance,
    random_state=random_state
)
states, observations = kf.sample(50, initial_state_mean)

# estimate state with filtering and smoothing
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]

# draw estimates
pl.figure()
lines_true = pl.plot(states, color='b')
lines_filt = pl.plot(filtered_state_estimates, color='r', ls='-')
lines_smooth = pl.plot(smoothed_state_estimates, color='g', ls='-.')
pl.legend((lines_true[0], lines_filt[0], lines_smooth[0]),
          ('true', 'filt', 'smooth'),
          loc='lower left')
pl.show()
This code produces a graph of the true, filtered, and smoothed states.
However, for my experiment I have created a very small time-series dataset with three columns, formatted as follows. The full dataset is attached here for reproducibility.
time      X         Y
0.040662  1.041667  1
0.139757  1.760417  2
0.144357  1.190104  1
0.145341  1.047526  1
0.145401  1.011882  1
0.148465  1.002970  1
...
Instead of using the random values as shown in the code, how can we input the data from the CSV file I attached? Here is my approach, but it doesn't seem to work out for me, and I would appreciate any help.
df = pd.read_csv('testdata.csv')
pd.set_option('use_inf_as_null', True)
df.dropna(inplace=True)
X = df.drop('Y', axis=1)
y = df['Y']
d1= np.array(X)
d2 = np.array(y)
From the link I shared, here is how you get the CSV data into Numpy Arrays.
import numpy as np
import csv

with open('testdata.csv', 'r') as csvfile:
    r = csv.reader(csvfile, delimiter=',')
    data = [i for i in r]

headings = data.pop(0)
data = np.array([[float(j) for j in i] for i in data])
T = data.T[0]  # Time
X = data.T[1]  # X
Y = data.T[2]  # Y
print(T)
print(X)
print(Y)
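To close the loop on the question: once T, X, and Y are loaded, the sampled data in the example can be replaced by stacking the observed columns and passing them to the filter. A minimal sketch, assuming X and Y are the two observed dimensions of the 2-D model above and reusing the UnscentedKalmanFilter kf defined there:

# stack the two observed columns into an (n_timesteps, 2) array
observations = np.column_stack((X, Y))

# skip kf.sample(...) and filter/smooth the real measurements instead
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]

Whether the sinusoidal transition model actually fits this data is a separate modeling question; the sketch only shows how to feed the CSV values in.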

Estimating parameters using minimization in Python and speeding up this process

I am trying to find parameter estimates using minimization. The code I wrote works, but there are two problems:
It finds only a local minimum. I tried to solve this by using basinhopping.
It takes a very long time to get a result, and since I have to run this minimization around 1000 times this becomes a big issue.
So my questions are:
Do you know how I could optimize my code so that the minimization runs faster?
Is there a way to change the basinhopping part so that it runs faster? E.g. set niter lower, or a different method I'm not aware of. I tried running it like this, and after 10 hours I didn't get a result for even one of the 1000 individuals with basinhopping.
Is there another way to find a global minimum?
Feel free to ask further questions.
My code:
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import basinhopping
from scipy.integrate import odeint
import pickle
import os
import pandas as pd
import datetime
import numpy.random as npr
import csv

path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
os.chdir(path)

### IDS
df = pd.read_csv('1_Youtuber_SingleNrSheet_Comedy.csv', sep=";", skipinitialspace=True)  ###### Change Name
YoutuberID = df["Channel_ID"].tolist()
## print(YoutuberID)

with open("9_p_q_m_Fun_ExtendedBass_VIEWS_Comedy_test.csv", "w", newline='', encoding='utf-8') as csv_file2:  ###### Change Name
    csv_writer2 = csv.writer(csv_file2, delimiter=';')
    csv_writer2.writerow(["Type", "p", "q", "m", "Functionvalue"])
    count = 0
    for ID in YoutuberID[0:]:  ### Change
        try:
            path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python"
            os.chdir(path)

            ### ALL INFO
            Days = pd.read_csv('3_API_Call_ALL_info_Comedy_v2.csv', sep=";", skipinitialspace=True)
            views_path = "C:\\Users\Sebastian Gäumann\OneDrive\Dokumente\FS 2017\Bachelorarbeit\Python\Daily_Views_Comedy"  ###### Change Name
            os.chdir(views_path)
            SVR = pd.read_csv("4_COMEDY_DailyViews_Clean_" + str(count) + "_" + ID + ".csv", sep=";", parse_dates=True, dayfirst=True)  ###### Change Name
            ## print(SVR[SVR.columns[0]])
            SVR = SVR[SVR[SVR.columns[0]] < "2018-05-01"]  #### CHANGE DATE FOR DIF CAT
            ## print(SVR)

            ##### SV Input
            SV = np.array(SVR["Daily Views"])
            ## print(SV)
            Days = Days[Days["channelId"] == ID]
            ## print(Days)
            Days["publishedAt"] = pd.to_datetime(Days.publishedAt)
            Days = Days[Days["publishedAt"] > "2015-01-08"]  ## "2015-01-10"
            ## print(Days)

            ##### Timedelta #####
            start_date = pd.to_datetime("2015-06-08")
            ## print(start_date)
            video_upload_day = []
            for video_date in Days["publishedAt"]:
                TimeDelta = video_date - start_date
                video_upload_day.append(TimeDelta.days)
            ## print(video_upload_day)
            nvideos = len(video_upload_day)
            ndays = len(SV)
            videoT = np.array(video_upload_day)
            ## print(videoT, nvideos, ndays)

            def objective(x):
                p = x[0]
                q = x[1]
                m = x[2]
                estimateV = np.zeros((ndays, nvideos))
                for t in range(ndays):
                    for v in range(nvideos):
                        if videoT[v] <= t:
                            estimateV[t, v] = p*m + (q-p) * np.sum(estimateV[0:t, v], axis=0) - (q/m) * (np.sum(estimateV[0:t, v], axis=0)**2)
                estimateSV = np.sum(estimateV, axis=1)
                return np.sum((SV - estimateSV)**2)
This is the minimization part. I made one version for the normal minimization and one for basinhopping and separated them with ##.
            ###### MINIMIZATION #######
            mguess = round(sum(SV)/(nvideos*2), 0)
            print(sum(SV), mguess)
            x0 = np.array([0.001, 0.01, mguess])  #### Make it less volatile to first guess? Make bigger steps for m?
            b1 = (0.00001, 0.5)
            b2 = (10**4, 10**7)
            bnds = (b1, b1, b2)
            ## minimizer_kwargs = dict(method="L-BFGS-B", bounds=bnds)
            ## res = basinhopping(objective, x0, niter=20, minimizer_kwargs=minimizer_kwargs)
            res = minimize(objective, x0, bounds=bnds)
            print(res)
            csv_writer2.writerow(["COMEDY", res.x[0], res.x[1], res.x[2], res.fun])  ### CHANGE CAT
            print("CURRENT YOUTUBER IS:", count)
            count += 1
        except:
            print("PROBLEM", count)
            count += 1
        ## print(res, res.x[0], res.x[1], res.x[2], res.fun)
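No answer is recorded for this question, so the following is only a hedged pointer. Most of the time is spent in the pure-Python double loop inside objective, which recomputes np.sum(estimateV[0:t, v]) from scratch at every step; keeping a running cumulative sum per video cuts one objective call from roughly O(ndays² · nvideos) to O(ndays · nvideos). For the global-minimum question, scipy also ships scipy.optimize.differential_evolution, which accepts the same bounds directly. A minimal sketch of both ideas (validate the rewrite against the original objective on real data before trusting it):

import numpy as np
from scipy.optimize import differential_evolution

def objective(x, SV, videoT, ndays, nvideos):
    p, q, m = x
    estimateV = np.zeros((ndays, nvideos))
    cumV = np.zeros(nvideos)       # running sum of estimateV[0:t, v] per video
    for t in range(ndays):
        active = videoT <= t       # vectorize over videos instead of an inner loop
        estimateV[t, active] = p*m + (q - p)*cumV[active] - (q/m)*cumV[active]**2
        cumV += estimateV[t]
    return np.sum((SV - estimateV.sum(axis=1))**2)

# bounds as in the question: p and q in (1e-5, 0.5), m in (1e4, 1e7)
bnds = [(0.00001, 0.5), (0.00001, 0.5), (10**4, 10**7)]
# res = differential_evolution(objective, bnds, args=(SV, videoT, ndays, nvideos),
#                              maxiter=50, polish=True, seed=0)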
