Import and parse .data file

Import and parse .data file - python-3.x

there is a file I tried to import and safe as pandas df. At a first sight looks like it's already columns and rows ordered, but finally I had to do a bunch of stuff to create pandas df. Could you please check if there is much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd
r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)): # remove empty strings
if bool(step_1[n]) == False:
del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)): # remove empty strings in each sublist
step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)

When you replaced \t to blankspace, you can use read_csv to read it. But you need to wrap up your text, because the first parameter in read_csv is filepath_or_buffer which needs object with a read() method (such as a file handle or StringIO). Then your question can be transform to read_csv doesn't read the column names correctly on this file?
import requests
import pandas as pd
from io import StringIO
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t"," ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin','car name']
df = pd.read_csv(StringIO(file),sep="\s+",header = None,names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df)

Related

Perform code on multiple files 1 by 1 pandas

Hi I have code I have written to read a .csv file in a folder and add some required columns.
I now want to perform this code on multiple files within the path folder 1 by 1 and save each as a separate df.
My current code is as follows
import pandas as pd
import glob
import os
path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs
Cowboys\Test' # use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
frame = pd.read_csv(filename, index_col=None, skiprows=8)
li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)
frame['filename'] = os.path.basename
#Add odometer change and turn all accel values to positive
import numpy as np
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()
#Add column with OdChange # >5.5m/s
frame["new1"] = np.where(
(frame.Velocity >=5.5),
frame["OdChange"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new2"] = np.where(
(frame.accelpos >=2.5),
frame["accelpos"],
'0')
#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new3"] = np.where(
(frame.Acceleration >=2.5),
'1',
'0')
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)
frame['new4']
#m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600, axis=0).sum()
#HSm/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600, axis=0).sum()
#AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600, axis=0).mean() *60
#AccDec Peak Count
frame['1minAccCount'] = frame['new4'].rolling(window=600, axis=0).sum()
print (frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!

Pandas - Add items to dataframe

I am trying to add row items to the dataframe, and I am not able to update the dataframe.
What i tried until now is commented out as it doesn't do what I need.
I simply want to download the json file and store it to a dataframe with those given columns. Seems I am not able to extract the child components fron JSON file and store them to a brand new dataframe.
Please find bellow my code:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
for k in item.keys():
headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
print(item[1])
# new_df['product'] = item[1]['product']
# new_df['vendorProject'] = item[1]['vendorProject']
# new_df['dueDate'] = item[1]['dueDate']
# new_df['shortDescription'] = item[1]['shortDescription']
# new_df['dateAdded'] = item[1]['dateAdded']
# new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
# new_df['cveID'] = item[1]['cveID']
# new_df.append(item[1], ignore_index = True)
new_df
At the end my df is still blank.

The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
new_df = pd.DataFrame(pd.json_normalize(data['vulnerabilities']))
UPDATE: Unnested the vulnerabilities column specifically.
Output:

It worked with this:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
for k in item.keys():
headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
new_df.loc[len(new_df.index)] = item[1] <===THIS
new_df.head()

Pass url column's values one by one to web crawler code in Python

Based on the answered code from this link, I'm able to create a new column: df['url'] = 'https://www.cspea.com.cn/list/c01/' + df['projectCode'].
Next step I would like to pass the url column's values to the following code and append all the scrapied contents as dataframe.
import urllib3
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186" # url column's values should be passed here one by one
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
print(df)
How could I do that in Python? Thanks.
Updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
data = []
urls = df.url.tolist()
for url_link in urls:
url = url_link
# url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
df = df.T
df.reset_index(drop=True, inplace=True)
print(df)
df.to_excel('result.xlsx', index = False)
But it only saved one rows into excel file.

You need to combine the dfs generated in the loop. You could add them to a list and then call pd.concat on that list.
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
# data = []
urls = df.url.tolist()
dfs = []
for url_link in urls:
url = url_link
# url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
df = df.T
df.reset_index(drop=True, inplace=True)
print(df)
dfs.append(df)
df = pd.concat(dfs)
df.to_excel('result.xlsx', index = False)

Use
urls = df.url.tolist()
To create a list of URLs and then iterate through them using f string to insert each one into your base url

Reading in multiple files in Python and saving them one by one in a different directory

import glob
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
files = glob.glob("Angular_position_*_*.csv")
output = pd.DataFrame()
for f in files:
df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
np.savetxt("enter directory path/Filtered_files/Filtered_Angular_position_*_*", output, delimiter = ', ', newline = "\n")
I am trying to read in all files in a directory, they are then low pass filtered. After that the results are saved one after the other but not in one file. The result gives each files with 3 columns and ideally I would like them to named with headers e.g. col1, col2, col3.
Without using glob, I can filter all my files individually but I have more than 100 such files.
Any help would be appreciated.
best wishes,

I have partially solved the issue apart from the header names:
import glob
import pandas as pd
from tnorma import tnorma
import seaborn as sns
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
path = r'location_of_dir'
all_files = glob.glob(path + '/*.csv')
# yn = np.zeros(shape = (101,1))
# tn = np.zeros(shape = (101,1))
#ynew = []
yn = np.zeros(shape = (101,1))
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
print(filename)
foo = filename.split("/")[-1]
#df = pd.read_csv(f)
time = df.iloc[:,0]
time = time.to_numpy()
ynew = df.iloc[:,1:]
ynew = ynew.to_numpy()
#print(ynew)
lowPassCutoffFreq = 6.0 # Cut off frequency
Sample_freq = 150; #Target sample frequency
N = 2 # Order of the filter; In this case 2nd order
Wn = lowPassCutoffFreq/(Sample_freq/2) #Normalize frequency
b, a = signal.butter(5, Wn, btype='low',analog=False,output='ba')
#scipy.signal.butter(N, Wn, btype='low', analog=False, output='ba', fs=None)
output = signal.filtfilt(b, a, ynew, axis=0)
#print (output)
tn = np.linspace(0, 100, 101) # new time vector for the new time-normalized data
yn, tn, indie = tnorma(output, k=3, smooth =1, mask = None, show = False)
np.savetxt("path_name/foldername/file"+ foo, yn, delimiter = ', ', newline = "\n")
However, I am having difficulty in putting header names on the 3 columns per file.

DataFrame append generates TypeError

I am trying to write a function to write and read transaction details to/from a .h5 file. I want to effectively use one file to store some transaction details, and when necessary, derive the details. Here's my code:
import h5py
import numpy as np
import pandas as pd
from datetime import datetime
from os import listdir
from pandas import HDFStore
def maintainLedger(mode, tick, lastBuyy = 0, lastSell = 0, quan = 0, prof = 0):
"""THIS FUNCTION WRITES AND READS TRANSACTION DETAILS.
mode = 0 - IF FILE EXITS, READ FILE
mode = 1 - IF FILE EXITS, APPEND TO FILE"""
# CHECK IF LEDGER FILE EXISTS, IF NOT CREATE A LEDGER FILE FOR THE FIRST TIME
path = r'ledger'
suff = r'h5'
flie = listdir(path)
flie = [item for item in flie if item.endswith(suff)]
if len(flie) == 0:
HDF5Data = HDFStore('ledger/ledger.h5')
# GENERATE NEW VALUES OF DATE/TIME
mi = int(datetime.now().minute)
ho = int(datetime.now().hour)
da = int(datetime.now().day)
we = int(datetime.now().isocalendar()[1])
mo = int(datetime.now().month)
ye = int(datetime.now().year)
newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
HDF5Data.put('data', newwData, format = 'table', data_columns = True)
HDF5Data.close()
elif len(flie) == 1:
if mode == 0:
# READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME
readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
# DO SOMETHING...
elif mode == 1:
# GENERATE NEW VALUES OF DATE/TIME
mi = int(datetime.now().minute)
ho = int(datetime.now().hour)
da = int(datetime.now().day)
we = int(datetime.now().isocalendar()[1])
mo = int(datetime.now().month)
ye = int(datetime.now().year)
# GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
# READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME AND APPEND NEW DATA
readData = pd.read_hdf('ledger/ledger.h5', mode = 'a')
readData.append('data', newwData)
tempData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
print(tempData)
else:
print('Please check input data for errors!')
if __name__ == '__main__':
maintainLedger(1, "AAPL")
When I run the code, I am getting the following error:
TypeError: cannot concatenate object of type "<class 'str'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid
I have tried looking for a solution, and a quick search led me to this, which didn't solve my problem. Is there something I am doing wrong? Any advice would be appreciated.

import h5py
import numpy as np
import pandas as pd
from datetime import datetime
from os import listdir
from pandas import HDFStore
def maintainLedger(mode, tick = 'QUERY', lastBuyy = 0, lastSell = 0, quan = 0, prof = 0):
"""THIS FUNCTION WRITES AND READS TRANSACTION DETAILS.
mode = 0 - IF FILE EXITS, READ FILE
mode = 1 - IF FILE EXITS, APPEND TO FILE"""
# CHECK IF LEDGER FILE EXISTS, IF NOT CREATE A LEDGER FILE FOR THE FIRST TIME
path = r'ledger'
suff = r'h5'
flie = listdir(path)
flie = [item for item in flie if item.endswith(suff)]
if len(flie) == 0:
# GENERATE NEW VALUES OF DATE/TIME
mi = int(datetime.now().minute)
ho = int(datetime.now().hour)
da = int(datetime.now().day)
we = int(datetime.now().isocalendar()[1])
mo = int(datetime.now().month)
ye = int(datetime.now().year)
# GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
# SAVE ALL DATA INTO .H5 FORMAT
HDF5Data = HDFStore('ledger/ledger.h5')
HDF5Data.put('data', newwData, format = 'table', data_columns = True)
HDF5Data.close()
elif len(flie) == 1:
if mode == 0:
"""THIS OPTION ENABLES CODE TO READ DATA."""
# READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME
readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
# DO SOMETHING...
print(readData)
elif mode == 1:
"""THIS OPTION ENABLES CODE TO APPEND DATA."""
# GENERATE NEW VALUES OF DATE/TIME
mi = int(datetime.now().minute)
ho = int(datetime.now().hour)
da = int(datetime.now().day)
we = int(datetime.now().isocalendar()[1])
mo = int(datetime.now().month)
ye = int(datetime.now().year)
# GATHER NEW DATA INTO NUMPY ARRAY AND CONVERT TO PANDAS DATAFRAME
newwData = np.array([mode, mi, ho, da, we, mo, ye, tick, lastBuyy, lastSell, quan, prof]).reshape(1, 12)
newwData = pd.DataFrame(newwData, columns = ['mode', 'mi', 'ho', 'da', 'we', 'mo', 'ye', 'tick', 'laBu', 'laSe', 'quan', 'prof'])
# READ PREVIOUSLY SAVED DATA AS PANDAS DATAFRAME AND APPEND NEW DATA
readData = pd.read_hdf('ledger/ledger.h5', mode = 'r')
readData = readData.append(newwData)
# SAVE ALL DATA INTO .H5 FORMAT
HDF5Data = HDFStore('ledger/ledger.h5')
HDF5Data.put('data', readData, format = 'table', data_columns = True)
HDF5Data.close()
else:
print('Please check input data for errors!')
if __name__ == '__main__':
maintainLedger(1, 'MSFT')

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Import and parse .data file - python-3.x

Related

Perform code on multiple files 1 by 1 pandas

Pandas - Add items to dataframe

Pass url column's values one by one to web crawler code in Python

Reading in multiple files in Python and saving them one by one in a different directory

DataFrame append generates TypeError

Categories

Resources