Loop over excel files' paths under a directory and pass them to data manipulation function in Python - python-3.x

I need to check the excel files under a directory /Users/x/Documents/test/ by DataCheck function from data_check.py, so I can do data manipulation of many excel files, data_check.py has code structure as follows:
import pandas as pd
# Read one Excel workbook into a DataFrame, clean it and write it back out.
# filePath: path of the workbook handed to pd.read_excel.
def DataCheck(filePath):
df = pd.read_excel(filePath)
try:
# Drop rows in which 'building', 'floor' and 'room' are all missing.
df = df.dropna(subset=['building', 'floor', 'room'], how = 'all')
...
...
...
# NOTE(review): `writer` is presumably created in the elided lines above — confirm.
df.to_excel(writer, 'Sheet1', index = False)
# Script entry point: keep prompting for a file path and run DataCheck on it;
# an empty answer clears the loop flag.
if __name__ == '__main__':
status = True
while status:
rawPath = input(r"")
# Drop surrounding double quotes (e.g. a path pasted with quotes around it).
filePath = rawPath.strip('\"')
if filePath.strip() == "":
status = False
# NOTE(review): the original indentation is lost; if this call sits at loop
# level it also runs once with the empty path before the loop ends — confirm.
DataCheck(filePath)
In order to loop all the excel files' paths under a directory, I use:
import os

directory = '/Users/x/Documents/test/'

# Print the full path of every Excel workbook directly inside `directory`.
for filename in os.listdir(directory):
    # str.endswith accepts a tuple, so one call covers both extensions.
    if filename.endswith((".xlsx", ".xls")):
        print(os.path.join(directory, filename))
Out:
/Users/x/Documents/test/test 3.xlsx
/Users/x/Documents/test/test 2.xlsx
/Users/x/Documents/test/test 4.xlsx
/Users/x/Documents/test/test.xlsx
But I don't know how to combine the code above together, to pass the excel files' paths to DataCheck(filePath).
Thanks in advance for your kind help.

Call the function with the names instead of printing them:
import os

directory = '/Users/x/Documents/test/'

# Hand the full path of every Excel workbook in `directory` to DataCheck.
for filename in os.listdir(directory):
    # str.endswith accepts a tuple, so one call covers both extensions.
    if not filename.endswith((".xlsx", ".xls")):
        continue
    DataCheck(os.path.join(directory, filename))

Related

Using pandas pd.Excel File with user input for folder path and filename

I'm using pd.ExcelFile as below to open and parse a file, but currently only with the actual folder path and filename in one string.
wb = pd.ExcelFile(folder_path+filename)
I want to put this into a function, that asks the user to give a path and filename and deals with invalid input. I started something like the below, but it doesn't seem like the error is being generated inside the function anyway, and i'm not sure how to say 'while wb isn't a valid thing' to continue to prompt for a filepath until we get a valid one?
# Prompt for a folder, a file name and a sheet name, then try to open the workbook.
def Load_Parse():
folder_path = input('\nEnter the path to the qry_T spreadsheet here (include slashes at the start and at the end): ')
# NOTE(review): the "including the extension" hint sits on the sheet prompt below
# but reads as if it belongs to this file-name prompt — confirm.
filename = input('\nEnter the name of the spreadsheet to be used here: ')
sheetname = input('\nEnter the sheet containing the data here, including the extension (e.g. "qry_Trajectory 2019.xlsx": ')
try:
# The path is built by plain concatenation, so folder_path must end with a separator.
wb = pd.ExcelFile(folder_path+filename)
# NOTE(review): the handler body is missing in the original snippet.
except FileNotFoundError:
Any ideas?
I'll then parse the file using a similar method i hope:
df = wb.parse('filename')
using Pathlib, os and pandas and a few functions.
One of the key constructs you'll need is `while True`, which keeps executing a block of code over and over until you explicitly `break` out of it (here, once the input is valid).
feel free to edit to your own spec.
Modules
from pathlib import Path
import os
import pandas as pd
from xlrd import XLRDError
In Action
df = load_parser()
out:
#Hello Umar.Hussain please enter a valid target directory
#C:\Users\UmarH\Files
#1 excels_0
#2 excels_1
#Choose a number between 1 and 2
1
#Your Choice is excels_0.xlsx
#Choose a Sheet - Lists all sheets
'Sheet1'
# returns dataframe
Main Function
def load_parser():
    """Walk the user through picking a directory, a workbook and a sheet.

    Returns whatever ``enumerate_sheets`` returns for the chosen workbook.
    """
    print(f"Hello {os.getlogin()} please enter a valid target directory")
    directory = file_tester(input(''), file_type='path')
    print("Please select a number from the following file")
    workbook_path = create_excel_dict(directory)
    return enumerate_sheets(workbook_path)
Helper Functions
def file_tester(string_path, file_type="path"):
    """Return *string_path* as a ``Path`` once it names an existing directory.

    While the candidate is not a directory the user is asked again; *file_type*
    is only used in the wording of that prompt.
    """
    candidate = Path(string_path)
    while not candidate.is_dir():
        candidate = Path(input(f"Please Enter a Valid {file_type}"))
    return candidate
def create_excel_dict(target_path):
    """Let the user choose one ``.xlsx`` file in *target_path* by number.

    The workbooks are listed as ``<number> <stem>`` and the prompt is repeated
    until the answer is one of the listed numbers.

    target_path: a ``pathlib.Path`` pointing at the directory to search.
    Returns the ``Path`` of the chosen workbook.
    Raises FileNotFoundError when the directory holds no ``.xlsx`` file.
    """
    # Sort so the numbering is stable between runs; glob order is arbitrary.
    xlsx_dict = dict(enumerate(sorted(target_path.glob('*.xlsx')), 1))
    if not xlsx_dict:
        # Previously this surfaced as an IndexError while building the prompt.
        raise FileNotFoundError(f"No .xlsx files found in {target_path}")
    for number, workbook in xlsx_dict.items():
        print(number, workbook.stem)
    prompt = f'Choose a number between 1 and {len(xlsx_dict)}'
    while True:
        try:
            chosen = xlsx_dict[int(input(prompt))]
        except (ValueError, KeyError):
            # ValueError: not a number at all; KeyError: number out of range.
            continue
        print(f"Your Choice is {chosen}")
        return chosen
def enumerate_sheets(target_file):
    """List the sheets of *target_file* and load the one the user names.

    target_file: path of the workbook to open.
    Returns the chosen sheet as a DataFrame.
    """
    # The context manager closes the workbook handle once the sheet is loaded.
    with pd.ExcelFile(target_file) as xl:
        for sheet in xl.sheet_names:
            print(sheet)
        target_sheet = input("Please Type Your sheet name")
        # Check the answer against the workbook's own sheet list instead of
        # waiting for the reader to fail: which exception a missing sheet
        # raises differs between reader engines, so catching only XLRDError
        # is not reliable.
        while target_sheet not in xl.sheet_names:
            target_sheet = input("Please enter a sheet from above.")
        return pd.read_excel(xl, sheet_name=target_sheet)

Trouble reading csvs saved in sharefile (citrix)

I wrote the following code to create dataframes from files saved in sharefile. It works perfectly for excel files, but fails for csv files with the error EmptyDataError: No columns to parse from file.
tblname = 'test'
fPth = r'Z:\Favorites\test10 (Group D - Custom EM&V)\8 PII\16 - Project Selection Plan\QC\Data\test.csv'
sht = 'Gross_Data'
shtStart = 0
fType = 'csv'
# Fetch the ShareFile item for fPth and load its data into a DataFrame,
# choosing the pandas reader from fType.
fitem = sfsession.get_io_version(fPth)
if fitem is None:
print(f'Could not create sharefile item for {fPth}')
else:
try:
if fType == 'csv':
# NOTE(review): the fix reported further down is to call
# fitem.io_data.seek(0) before this read.
df = pd.read_csv(fitem.io_data, header = shtStart)
elif fType == 'excel':
df = pd.read_excel(fitem.io_data, sheet_name = sht, header = shtStart)
else:
pass
print(f'Data import COMPLETE for {fPth}: {str(datetime.now())}')
# NOTE(review): a bare except hides which error occurred (e.g. EmptyDataError);
# consider catching a narrower exception and logging it.
except:
print(f'Data import FAILED for {fPth}')
logging.critical(f'Data import FAILED for {fPth}')
If I replace fitem.io_data with fPth in df = pd.read_csv, the code works, but I can't use that as a permanent solution. Any suggestions?
Also sfsession is a sharefile session and get_io_version(fPth) gets the token and downloads all the file properties include its data.
Thanks.
An adaptation of this solution worked for me:
StringIO and pandas read_csv
I added fitem.io_data.seek(0) before the df = ... line
Closing the question.

Change order in filenames in a folder

I need to rename a bunch of files in a specific folder. They all end with a date and time, for example "hello 2019-05-22 1310.txt", and I want the date and time to come first in each file name so I can sort them. With my code I get an error: it won't find the files in my directory. What is wrong with the code?
import os
import re
import shutil

dir_path = r'C:\Users\Admin\Desktop\Testfiles'
# ISO date (YYYY-MM-DD), optionally followed by a 4-digit time such as "1310",
# so that date and time are moved to the front together.
comp = re.compile(r'\d{4}-\d{2}-\d{2}(?: \d{4})?')

for file in os.listdir(dir_path):
    # splitext keeps the leading dot in `ext` and yields '' when there is no
    # extension, so extension-less files no longer get a trailing '.'.
    name, ext = os.path.splitext(file)
    match = comp.search(name)
    if match is None:
        print('file {} is not change'.format(name))
        continue
    date = match.group()
    # Cut the date out and collapse the whitespace it leaves behind.
    rest_name = ' '.join(comp.sub(' ', name, count=1).split())
    new_name = ' '.join(part for part in (date, rest_name) if part) + ext
    if new_name == file:
        # Already starts with the date; nothing to rename.
        continue
    print('changing {} to {}'.format(file, new_name))
    # The source must be the real file name *including* its extension; using
    # the stripped name is what made the move fail with "file not found".
    shutil.move(os.path.join(dir_path, file), os.path.join(dir_path, new_name))

Save CSV for every functioncall with another name

at the moment I am able to create one CSV file with all the content I get at once.
Now I would like to create a list where I have different names in it.
How can I produce a different CSV file name for every function call? I thought about looping over a list, but I just want the index to advance by one on each call. I thought about saving my state somehow and using it in the next function call, but every time I initialize my variable with 0, so I never get to 1. I think I could do it with function parameters, but I have no idea how to use them. Can someone give me a little tip or an example? If there are better ideas (maybe my idea is completely off), please show me how to solve this.
The comments in the code shall represent my imagination.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tenable.sc import SecurityCenter as SC
import os.path
import sys
import getpass
import csv
SC_HOST = '...'
def parse_entry(entry):
    """Append the ``Path : ...`` values of one entry to ``testfile_path.csv``.

    entry: mapping read with ``.get``; the 'ip' and 'pluginText' keys are used.
    Every line of the plugin text that contains ``'Path : '`` produces one CSV
    row holding the IP address and the text after that marker. The header row
    is written only when the file does not exist yet. Entries without plugin
    text, or without a matching line, write nothing.
    """
    ip = entry.get('ip', None)
    # A missing pluginText would make the `in` tests below raise TypeError.
    pluginText = entry.get('pluginText', None) or ''
    if 'Path : ' not in pluginText:
        return
    for line in pluginText.splitlines():
        if 'Path : ' not in line:
            continue
        # Keep the parsed value under the name that is written below; the
        # original stored it elsewhere and always wrote an empty path.
        split_after_path = line.split("Path : ", 1)[1]
        # place = ['place1', 'place2', 'place3', 'place4', 'place5']
        # i = 0
        # i = i+1
        file_exists = os.path.isfile('testfile_path.csv')
        # file_exists = os.path.isfile('testfile_path_'+place[i]+'.csv')
        # data = open('testfile_path_'+place[i]+'.csv', 'a')
        with open('testfile_path.csv', 'a') as csvfile:
            header = ['IP Address', 'Path']
            writer = csv.DictWriter(csvfile, lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, fieldnames=header)
            if not file_exists:
                writer.writeheader()
            writer.writerow({'IP Address': ip, 'Path': split_after_path})
def main():
    """Log in to SecurityCenter, export the matching findings, then log out.

    Credentials are read interactively. Returns 0 so the result can be passed
    straight to ``sys.exit``.
    """
    sc_user = input('[<<] username: ')
    sc_pass = getpass.getpass('[<<] password: ')
    sc = SC(SC_HOST)
    sc.login(sc_user, sc_pass)
    try:
        # Query API for data
        # asset = [12,13,14,25,29]
        # i = 0
        # assetid = asset[i]
        # vuln = sc.analysis.vulns(('pluginID', '=', '25072')('asset','=','assetid'))
        # i = i+1
        vuln = sc.analysis.vulns(('pluginID', '=', '25072'),('asset','=','11'))
        for entry in vuln:
            parse_entry(entry)
    finally:
        # End the session even when the query or the CSV export fails.
        sc.logout()
    return 0
if __name__ == '__main__':
sys.exit(main())
The simplest and most obvious solution is to pass the full file path to your parse_entry function, ie:
def parse_entry(entry, filepath):
# ...
if 'Path : ' in pluginText:
for line in pluginText.splitlines(0):
if 'Path : ' in line:
# ...
file_exists = os.path.isfile(filepath)
with open(filepath, 'a') as csvfile:
# ...
Then in main() use enumerate() to build sequential filenames:
def main():
    # ...
    for i, entry in enumerate(vuln):
        # Sequential names: testfile_path0.csv, testfile_path1.csv, ...
        # (the original format string started with a stray quote character).
        path = "testfile_path{}.csv".format(i)
        parse_entry(entry, path)
You can use a function attribute to keep track of the number of times the function has been called.
def parse_entry(entry):
parse_entry.i += 1
# outside the function you have to initialize the attribute
parse_entry.i = 0
Or you can look at other ways to initialize the function attribute in this post.
Alternatively, you can use glob to get the current number of files.
from glob import glob
i = len(glob('testfile_path_*.csv'))

txt file to an specific format

I have a .txt file with some data I would like to convert to xls. The txt file has this format:
1325 2016-09-08 13:42:35
1325 2016-09-08 21:52:24
1325 2016-09-10 13:00:26
1325 2016-09-10 20:47:39
and more data. What I would like is an .xls file that contains, in the first column, the first number from the .txt file; in the second column, the date of the process; in the third column, the time of the first process on that date; and in the fourth column, the time of the last process on that date. I currently do this manually because I don't know a lot about programming. The only thing I managed was to convert the file to .xls, but the content was converted with no changes. I found the code I used on the internet. How can I do it?
The code I used is:
from os import listdir
from os.path import isfile, join
import xlwt
import xlrd
mypath = input("Please enter the directory path for the input files: ")
textfiles = [ join(mypath,f) for f in listdir(mypath) if isfile(join(mypath,f)) and '.txt' in f]
def is_number(s):
    """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
# Number format applied to every numeric cell.
style = xlwt.XFStyle()
style.num_format_str = '#,###0.00'

# Convert each text file into a one-sheet .xls workbook next to it.
for textfile in textfiles:
    # Read-only access is enough, and the context manager closes the handle
    # (the original opened the file 'r+' and never closed it).
    with open(textfile, 'r') as f:
        row_list = [row.split('|') for row in f]
    # Transpose rows into columns; zip stops at the shortest row.
    column_list = zip(*row_list)
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Sheet1')
    for col_idx, column in enumerate(column_list):
        for row_idx, cell in enumerate(column):
            value = cell.strip()
            if is_number(value):
                worksheet.write(row_idx, col_idx, float(value), style=style)
            else:
                worksheet.write(row_idx, col_idx, value)
    workbook.save(textfile.replace('.txt', '.xls'))

Resources