Improve the speed of for loop over a loaded file - python-3.x

I have a dataset in text file in the following form:
5851F42D00000000,1
4BB5F64640B18CCF,2
742D2F7A0AE16FD9,1
76035E090D1F0796,1
6FA72CA540F7702C,3
.
.
.
The file contains 500K rows. My goal is to read the file and convert the hex values to binary. The following code works fine but it is very slow. Is there a trick to make it faster?
import pandas as pd
import numpy as np

df = pd.read_csv(path + 'dataset.txt', sep=",", header=None)
X = []
y = []
for i, row in df.iterrows():
    n = int('{:064b}'.format(int(row.values[0], 16)))
    X.append(n)
    y.append(row.values[1])
X = np.asarray(X)
y = np.asarray(y)

There is no need for an explicit loop and appending to lists.
Use pandas "magic":
df = pd.read_csv('test.csv', sep=",", header=None)
x = df[0].apply(lambda x: int('{:064b}'.format(int(x, 16)))).to_numpy()
y = df[1].to_numpy()
print(x, y)
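Note that .apply still calls a Python-level function once per row, so for 500K rows most of the cost remains. A plain list comprehension over the column is a comparable (often slightly faster) alternative; a minimal sketch, assuming the same df as above:
# Sketch: a list comprehension over the raw column. Note that
# int('{:064b}'.format(...)) re-parses the 64-digit binary string as a
# *decimal* number, which is what the original code does, so it is kept.
x = np.array([int('{:064b}'.format(int(v, 16))) for v in df[0]])
y = df[1].to_numpy()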

Related

Get the id of each row in a dataframe

I have written the code below, which computes (x1-x); 'outlier_reference_x' holds those differences. Now, how can I know which x1 and x values each difference came from? I want to know which x and x1 pairs have a difference greater than (3 * sd).
Actually, I want to know which rows (iloc) of df_reference and df_test the values in 'outlier_reference_x' refer to, and then I want to delete those rows.
Sorry for my language.
import pandas as pd

df_reference = pd.read_csv(
    "reference.txt",
    delim_whitespace=True,  # any whitespace separates data
    names=["x", "y"],       # column names
    index_col=False         # no index
)
df_test = pd.read_csv(
    "test.txt",
    delim_whitespace=True,  # any whitespace separates data
    names=["x1", "y1"],     # column names
    index_col=False         # no index
)
frames = [df_reference, df_test]
df = pd.concat(frames, axis=1)
df.to_csv('dataset.txt', sep='\t', header=True)

df_ = df[['x', 'x1']].copy()
df_['x1-x'] = df_['x1'] - df_['x']
set_mean_X = df_.loc[:, 'x1-x'].mean()
set_std_X = df_.loc[:, 'x1-x'].std()
outlier_reference_x = [x for x in df_['x1-x'] if (x > 3 * set_std_X)]
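A boolean mask preserves the row index, so it can answer "which x and x1 were they?" directly, where the list comprehension cannot. A minimal sketch under the same setup, assuming df_, df_reference, and df_test as above (all sharing the default RangeIndex after read_csv with index_col=False):
# Sketch: a boolean mask instead of a list comprehension keeps the row labels.
mask = df_['x1-x'] > 3 * set_std_X
outlier_rows = df_.index[mask]        # row labels of the outliers
print(df_.loc[mask, ['x', 'x1']])     # the x and x1 behind each outlier

# Drop those rows from the original frames.
df_reference = df_reference.drop(outlier_rows)
df_test = df_test.drop(outlier_rows)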

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt

files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
    df = pd.read_csv(f)
    time = df.iloc[:, 0]
    time = time.to_numpy()
    ynew = df.iloc[:, 1:]
    ynew = ynew.to_numpy()
    lowPassCutoffFreq = 6.0  # Cut-off frequency
    Sample_freq = 150        # Target sample frequency
    N = 2                    # Order of the filter; in this case 2nd order
    Wn = lowPassCutoffFreq / (Sample_freq / 2)  # Normalized frequency
    b, a = signal.butter(N, Wn, btype='low', analog=False, output='ba')
    # scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
    output = signal.filtfilt(b, a, ynew, axis=0)
    np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter=', ', newline="\n")
I am trying to read in all the files in a directory; they are then low-pass filtered. After that, the results are saved one after the other, but not in one file. Each resulting file has 3 columns, and ideally I would like them to be named with headers, e.g. col1, col2, col3.
Without using glob, I can filter all my files individually, but I have more than 100 such files.
Any help would be appreciated.
Best wishes,
I have partially solved the issue, apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt

path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape=(101, 1))
# tn = np.zeros(shape=(101, 1))
# ynew = []
yn = np.zeros(shape=(101, 1))
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    print(filename)
    foo = filename.split("/")[-1]
    # df = pd.read_csv(f)
    time = df.iloc[:, 0]
    time = time.to_numpy()
    ynew = df.iloc[:, 1:]
    ynew = ynew.to_numpy()
    # print(ynew)
    lowPassCutoffFreq = 6.0  # Cut-off frequency
    Sample_freq = 150        # Target sample frequency
    N = 2                    # Order of the filter; in this case 2nd order
    Wn = lowPassCutoffFreq / (Sample_freq / 2)  # Normalized frequency
    b, a = signal.butter(N, Wn, btype='low', analog=False, output='ba')
    # scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
    output = signal.filtfilt(b, a, ynew, axis=0)
    # print(output)
    tn = np.linspace(0, 100, 101)  # new time vector for the time-normalized data
    yn, tn, indie = tnorma(output, k=3, smooth=1, mask=None, show=False)
    np.savetxt("path_name/foldername/file" + foo, yn, delimiter=', ', newline="\n")
However, I am having difficulty in putting header names on the 3 columns per file.
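np.savetxt accepts a header string, so the three column names can be written as part of the same call. A minimal sketch of just the save step, assuming yn and foo as above (col1, col2, col3 are placeholder names):
# Sketch: np.savetxt can write a header line; comments='' stops it from
# prefixing the header with '#'.
np.savetxt("path_name/foldername/file" + foo, yn,
           delimiter=', ', newline="\n",
           header='col1, col2, col3', comments='')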

Extract pixels from a PGM file and convert them into a pandas DataFrame

I have a directory that has subdirectories, each with a bunch of PGM files. I would like to extract the pixels from each image and put them in a pandas DataFrame.
from PIL import Image
import os
import pandas as pd
import numpy as np

dirs = [r"D:\MSIT\Machine Learning\IMG" + "\\s" + str(i) for i in range(1, 41)]
pixels = list()
df = pd.DataFrame(columns=["f" + str(i) for i in range(1, 10305)])
cols = list(df.columns)
for directory in dirs:
    for filename in os.listdir(directory):
        im = Image.open(directory + "\\" + filename)
        dims = list(im.getdata())
        df2 = pd.Series(dims)
        pixels.append(dims)
k = 1
for i in pixels:
    for j in i:
        df2 = pd.Series(j)
        df.append(df2, ignore_index=True)
    print(str(k) + " Done")
    k += 1
print(df.head())
df.to_csv('pixel_data.csv')
I'm assuming you want the pixel values of the PGM files to be your features. You can use df.loc to index into a DataFrame and add your data row by row. Also, using numpy makes the process a little faster.
import pandas as pd
from PIL import Image
import os
import numpy as np

columns = [i for i in range(10304)]
columns.append('Label')
df = pd.DataFrame(columns=columns)
rows = 0
for direc in os.listdir():
    if direc.startswith('s'):
        print('Adding ' + direc)
        print('--------------')
        for file in os.listdir('./' + direc):
            im = Image.open('./' + direc + '/' + file)
            x = np.array(im.getdata())
            x = x.tolist()
            x.append(int(direc.replace('s', '')))
            df.loc[rows] = x
            rows += 1
df.to_csv('Dataset.csv')
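Growing a DataFrame row by row with df.loc reallocates repeatedly; collecting the rows in a plain list and building the DataFrame once at the end is usually noticeably faster. A minimal sketch of that variant, assuming the same directory layout and the columns list from above:
# Sketch: accumulate rows in a list, then build the DataFrame in one go.
rows = []
for direc in os.listdir():
    if direc.startswith('s'):
        for file in os.listdir('./' + direc):
            im = Image.open('./' + direc + '/' + file)
            rows.append(list(im.getdata()) + [int(direc.replace('s', ''))])
df = pd.DataFrame(rows, columns=columns)
df.to_csv('Dataset.csv')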

How to read from CSV file

I am trying to understand how the Kalman filter works for a non-linear system. While searching for an example, I came across this good basic example.
import numpy as np
import pylab as pl
import pandas as pd
from pykalman import UnscentedKalmanFilter

# initialize parameters
def transition_function(state, noise):
    a = np.sin(state[0]) + state[1] * noise[0]
    b = state[1] + noise[1]
    return np.array([a, b])

def observation_function(state, noise):
    C = np.array([[-1, 0.5], [0.2, 0.1]])
    return np.dot(C, state) + noise

transition_covariance = np.eye(2)
random_state = np.random.RandomState(0)
observation_covariance = np.eye(2) + random_state.randn(2, 2) * 0.1
initial_state_mean = [0, 0]
initial_state_covariance = [[1, 0.1], [-0.1, 1]]

# sample from model
kf = UnscentedKalmanFilter(
    transition_function, observation_function,
    transition_covariance, observation_covariance,
    initial_state_mean, initial_state_covariance,
    random_state=random_state
)
states, observations = kf.sample(50, initial_state_mean)

# estimate state with filtering and smoothing
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]

# draw estimates
pl.figure()
lines_true = pl.plot(states, color='b')
lines_filt = pl.plot(filtered_state_estimates, color='r', ls='-')
lines_smooth = pl.plot(smoothed_state_estimates, color='g', ls='-.')
pl.legend((lines_true[0], lines_filt[0], lines_smooth[0]),
          ('true', 'filt', 'smooth'),
          loc='lower left')
pl.show()
This code produces a graph comparing the true, filtered, and smoothed states.
However, for my experiment I have created a very small time-series dataset with three columns, formatted as follows. The full dataset is attached here for reproducibility.
time X Y
0.040662 1.041667 1
0.139757 1.760417 2
0.144357 1.190104 1
0.145341 1.047526 1
0.145401 1.011882 1
0.148465 1.002970 1
.... ..... .
Instead of using the randomly sampled values as shown in the code, how can we input the data from the CSV file I attached? Here is my approach, but it doesn't seem to work out for me, and I would appreciate any help.
df = pd.read_csv('testdata.csv')
pd.set_option('use_inf_as_na', True)  # 'use_inf_as_null' is the older, deprecated spelling
df.dropna(inplace=True)
X = df.drop('Y', axis=1)
y = df['Y']
d1 = np.array(X)
d2 = np.array(y)
From the link I shared, here is how you get the CSV data into Numpy Arrays.
import numpy as np
import csv

with open('testdata.csv', 'r') as csvfile:
    r = csv.reader(csvfile, delimiter=',')
    data = [i for i in r]

headings = data.pop(0)
data = np.array([[float(j) for j in i] for i in data])  # np.float is deprecated; use float

T = data.T[0]  # Time
X = data.T[1]  # X
Y = data.T[2]  # Y
print(T)
print(X)
print(Y)
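To connect this back to the UKF example, the two observed columns can be stacked into the (n_samples, 2) array that kf.filter and kf.smooth expect. A minimal sketch, assuming the kf object from the example above and that X and Y are the two observed dimensions:
# Sketch: feed the CSV columns to the filter in place of the sampled data.
observations = np.column_stack((X, Y))  # shape (n_samples, 2)
filtered_state_estimates = kf.filter(observations)[0]
smoothed_state_estimates = kf.smooth(observations)[0]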

How can I make my code call each file in the correct sequence?

I have a folder with 38 files. The names are like this:
AWA_s1_features.mat, AWA_s2_features.mat......AWA_s38_features.mat
Each file is an array with 28 columns but a different number of rows. For example: AWA_s1_features.mat = (139, 28), AWA_s2_features.mat = (199, 28), and so on.
As I am doing machine learning, I need to join all these files into one huge array and label each row. So for the 139 rows of AWA_s1_features.mat there must be 139 1s; for AWA_s2_features.mat there must be 199 2s; and so on, up to AWA_s38_features.mat, which must have a number of 38s.
I wrote some code, but I have found that the files are not called in order, and therefore the labeling is wrong. For example, AWA_s1_features.mat is not the first file to be called and has been labeled as 11, and AWA_s2_features.mat has been labeled as 21.
So how can I improve my code so that it calls each file in the correct sequence?
Here is the code:
import numpy as np
import scipy.io as sio
import glob

read_files = glob.glob('I:/2D/Features 2D/AWA_s*.mat')

x = np.array([])
y = np.array([])
q = 1
for f in read_files:
    l = sio.loadmat(f)['features']
    x = np.concatenate((x, l), axis=0) if x.size else l
    y_temp = q * np.ones((l.shape[0], 1))
    y = np.concatenate((y, y_temp), axis=0) if y.size else y_temp
    q = q + 1

sio.savemat('AWA_FeaturesAll.mat', {'x': x, 'y': y})
The problem is that the default sorting is alphabetical, meaning that "11" comes before "2". You want numerical sorting and one way would be to use the sorted function with a key parameter, like so:
import numpy as np
import scipy.io as sio
import glob

read_files = glob.glob('I:/2D/Features 2D/AWA_s*.mat')

x = np.array([])
y = np.array([])
q = 1
for f in sorted(read_files, key=lambda f: int(f.split('_')[1][1:])):
    l = sio.loadmat(f)['features']
    x = np.concatenate((x, l), axis=0) if x.size else l
    y_temp = q * np.ones((l.shape[0], 1))
    y = np.concatenate((y, y_temp), axis=0) if y.size else y_temp
    q = q + 1

sio.savemat('AWA_FeaturesAll.mat', {'x': x, 'y': y})
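The key above assumes the only underscores in the path are the two in the filename. A slightly more defensive variant (a sketch, not part of the original answer) pulls the number out with a regular expression:
import re

# Sketch: a regex-based sort key that tolerates underscores elsewhere in
# the path by matching the AWA_s<number>_features pattern directly.
def subject_number(path):
    return int(re.search(r'AWA_s(\d+)_features', path).group(1))

read_files = sorted(read_files, key=subject_number)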
