Extract Pixels from a pmg file and convert them into a pandas data frame - python-3.x

I have a directory that has subdirectories each with a bunch of PMG files, I would like to extract the pixels from each image and put them in a pandas data frame.
from PIL import Image
import os
import pandas as pd
import numpy as np
dirs = [r"D:\MSIT\Machine Learning\IMG"+"\\s"+str(i) for i in range(1,41)]
pixels = list()
df = pd.DataFrame(columns = ["f" + str(i) for i in range(1,10305)])
cols = list(df.columns)
for directory in dirs:
for filename in os.listdir(directory):
im = Image.open(directory + "\\" +filename)
dims = (list(im.getdata()))
df2 = pd.Series(dims)
pixels.append(dims)
k = 1
for i in pixels:
for j in i:
df2 = pd.Series(j)
df.append(df2, ignore_index = True)
print(str(k) + "Done")
k += 1
print(df.head())
df.to_csv('pixel_data.csv')

I'm assuming you want the pixel values of the PMG files to be your features. You can use df.loc to use indexing in a DataFrame and to add your data in a row after row fashion. Also, using numpy would make the process a little bit faster.
import pandas as pd
from PIL import Image
import os
import numpy as np
columns = [i for i in range(10304)]
columns.append('Label')
df = pd.DataFrame(columns=columns)
rows = 0
for direc in os.listdir():
if direc.startswith('s'):
print('Adding ' + direc)
print('--------------')
for file in os.listdir('./' + direc):
im = Image.open('./' + direc + '/' + file)
x = np.array(im.getdata())
x = x.tolist()
x.append(int(direc.replace('s', '')))
df.loc[rows] = x
rows += 1
df.to_csv('Dataset.csv')

Related

Perform code on multiple files 1 by 1 pandas

Hi I have code I have written to read a .csv file in a folder and add some required columns.
I now want to perform this code on multiple files within the path folder 1 by 1 and save each as a separate df.
My current code is as follows
import pandas as pd
import glob
import os
path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs
Cowboys\Test' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
frame = pd.read_csv(filename, index_col=None, skiprows=8)
li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)
frame['filename'] = os.path.basename
#Add odometer change and turn all accel values to positive
import numpy as np
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()
#Add column with OdChange # >5.5m/s
frame["new1"] = np.where(
(frame.Velocity >=5.5),
frame["OdChange"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new2"] = np.where(
(frame.accelpos >=2.5),
frame["accelpos"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new3"] = np.where(
(frame.Acceleration >=2.5),
'1',
'0')
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)
frame['new4']
#m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600, axis=0).sum()
#HSm/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600, axis=0).sum()
#AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600, axis=0).mean() *60
#AccDec Peak Count
frame['1minAccCount'] = frame['new4'].rolling(window=600, axis=0).sum()
print (frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,
I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

Improve the speed of for loop over a loaded file

I have a dataset in text file in the following form:
5851F42D00000000,1
4BB5F64640B18CCF,2
742D2F7A0AE16FD9,1
76035E090D1F0796,1
6FA72CA540F7702C,3
.
.
.
The file contains 500K rows. My goal is to read the file and convert the hex values to binary. The following code works fine but it is very slow. Is there a trick to make it faster?
import pandas as pd
import numpy as np
df = pd.read_csv(path+ 'dataset.txt', sep=",", header=None)
X = []
y = []
for i, row in df.iterrows():
n = int('{:064b}'.format(int(row.values[0], 16)))
X.append(n)
y.append(row.values[1])
X = np.asarray(X)
y = np.asarray(y)
No need of redundant loop and appending to lists.
Use pandas "magic":
df = pd.read_csv('test.csv', sep=",", header=None)
x = df[0].apply(lambda x: int('{:064b}'.format(int(x, 16)))).to_numpy()
y = df[1].to_numpy()
print(x, y)

Use of datetime timedelta with numpy 3d array

I have a 3D array with the count of number of days past a benchmark date (e.g., 01.01.2000). I am interested in the actual day-of-year (DOY: 1-365/366)rather than the total number of days past a given date.
For a single value, the below syntax works. For e.g.,
import numpy as np
import datetime
data = 1595
date = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(data -1)
date.timetuple().tm_yday
134
However, I am having issues with using a 3D array.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
data
array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int_array - 1)
return date_.timetuple().tm_yday
doy_data = data * 0 #Empty array
for i in range(2):
doy_data[:, :, i] = Int_to_DOY(data[:, :, i])
Here is the error message and I am not able to figure this out.
TypeError: unsupported type for timedelta days component: numpy.ndarray
Thanks for your help.
import numpy as np
import datetime
data = np.random.randint(5, size = (2,2,2))
data = data + 1595
#Function
def Int_to_DOY(int_array):
date_ = datetime.datetime(2000,1,1,0,0) + datetime.timedelta(int(int_array) -1)
return date_.timetuple().tm_yday
doy_data = data.flatten()
for i in range(len(doy_data)):
doy_data[i] = Int_to_DOY(doy_data[i])
doy_data = doy_data.reshape((2,2,2))
Since you tagged pandas:
data = np.array([[[1596, 1595],
[1599, 1599]],
[[1596, 1599],
[1595, 1595]]])
s = pd.to_datetime('2000-01-01') + pd.to_timedelta(data.ravel(), unit='D')
s.dayofyear.values.reshape(data.shape) - 1
Output:
array([[[135, 134],
[138, 138]],
[[135, 138],
[134, 134]]], dtype=int64)

Why is plot returning "ValueError: could not convert string to float:" when a dataframe column of floats is being passed to the plot function?

I am trying to plot a dataframe I have created from an excel spreadsheet using either matplotlib or matplotlib and pandas ie. df.plot. However, python keeps returning a cannot convert string to float error. This is confusing since when I print the column of the dataframe it appears to be all float values.
I've tried printing the values of the dataframe column and using the pandas.plot syntax. I've also tried saving the column to a new variable.
import pandas as pd
from matplotlib import pyplot as plt
import glob
import openpyxl
import math
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Border, Side, Alignment
import seaborn as sns
import itertools
directory = 'E:\some directory'
#QA_directory = directory + '**/*COPY.xlsx'
wb = openpyxl.load_workbook(directory + '\\Calcs\\' + "excel file.xlsx", data_only = 'True')
plt.figure(figsize=(16,9))
axes = plt.axes()
plt.title('Drag Amplification', fontsize = 16)
plt.xlabel('Time (s)', fontsize = 14)
plt.ylabel('Cf', fontsize = 14)
d = pd.DataFrame()
n=[]
for sheets in wb.sheetnames:
if '2_1' in sheets and '2%' not in sheets and '44%' not in sheets:
name = sheets[:8]
print(name)
ws = wb[sheets]
data = ws.values
cols = next(data)[1:]
data = list(data)
idx = [r[0] for r in data]
data = (itertools.islice(r, 1, None) for r in data)
df = pd.DataFrame(data, index=idx, columns=cols)
df = df.dropna()
#x = df['x/l']
#y = df.Cf
print(df.columns)
print(df.Cf.values)
x=df['x/l'].values
plt.plot(x, df.Cf.values)
"""x = [wb[sheets].cell(row=row,column=1).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
print(x)
Cf = [wb[sheets].cell(row=row,column=6).value for row in range(1,2000) if wb[sheets].cell(row=row,column=1).value]
d[name+ 'x'] = pd.DataFrame(x)
d[name + '_Cf'] = pd.Series(Cf, index=d.index)
print(name)"""
print(df)
plt.show()
I'm expecting a plot of line graphs with the values of x/l on the x access and Cf on the 'y' with a line for each of the relevant sheets in the workbook. Any insights as to why i am getting this error would be appreciated!

Resources