Loop over files - python-3.x

I have a series of files named file_0001.csv, file_0002.csv, ... file_1000.csv etc. I need to read them iteratively by creating a list of the filenames and as
import numpy as np
import pandas as pd
for fileName in files:
data = pd.read_csv("folder" + fileName)
data = data.values
How do I create the list of file names by checking the first and last file.
Thank you for your help

if you are trying to create the specific file names, you can loop over from 1 to 1000 and create filenames
import numpy as np
import pandas as pd
for i in range(1,1001):
num = str(i)
filename = "file_" + num.zfill(4) + ".csv"
#data = pd.read_csv("folder" + fileName)
#data = data.values
print filename
output last 10 lines:
file_0991.csv
file_0992.csv
file_0993.csv
file_0994.csv
file_0995.csv
file_0996.csv
file_0997.csv
file_0998.csv
file_0999.csv
file_1000.csv

You should use pathlib from the standard library. Then you can simply do
import pandas as pd
from pathlib import Path
# get file number assuming the name format "file_number.csv"
def get_file_number(file_path):
return int(file_path.stem.split("_")[1])
folder_path = Path("path/to/folder")
# sort files by file number
files = sorted(folder_path.glob("file_*.csv"), key=get_file_number)
for file in files:
print(file.name) # just to check
data = pd.read_csv(file)
# do something with data

Related

PysimpleGUI Combo source from Excel

Dears,
I'm using excel file as my data source, and in my program I'm using sg.combo , where it contains a long list, consequently, I want to have this list in one of the sheets in my excel file, how to do that ?
from pathlib import Path
import PySimpleGUI as sg
import pandas as pd
current_dir = Path(__file__).parent if '__file__' in locals() else Path.cwd()
EXCEL_FILE = current_dir / 'Example.xlsx'
df = pd.read_excel(EXCEL_FILE)
Thanks

Header is repeating when merging multiple files using Python shell in AWS Glue

I am new to Python and AWS Glue.
I am trying to merge few excel files in a S3 source bucket and generate 1 output file (csv) in a target S3 bucket. I am able to read and generate the output file with merged data but the only problem is that the header is repeating from each file.
Can someone help to debug to remove the repeating headers?
Below is my code:
import pandas as pd
import glob
import xlrd
import openpyxl
import boto3
import io
import json
import os
from io import StringIO
import numpy as np
s3 = boto3.resource('s3')
bucket = s3.Bucket('test bucket')
prefix_objs = bucket.objects.filter(Prefix='source/file')
prefix_df = []
for obj in prefix_objs:
key = obj.key
print(key)
temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
prefix_df.append(temp)
bucket = 'test bucket'
csv_buffer = StringIO()
for current_df in prefix_df:
current_df.to_csv(csv_buffer, index = None)
print(current_df)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'merge.csv').put(Body=csv_buffer.getvalue())
Please help!
Regards,
Vijay
Change this line and add the parameter header.
temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
to
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', header=1)
or
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', skiprows=1)
You need to test the header value, because sometimes the header starts not in the first row.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

Dynamically insert file name and path for saving csv in pandas

I need to dynamically save my pandas data frame. I have successfully managed to output a CSV file with a static name using the following code:
export_csv = df.to_csv(r'static_name.csv', header=False , index=False)
But I have failed to make this work dynamically. With the code below, I expect to get a file with the name passed into args.save_file_name and .csv suffix. However, I get no result.
import os
import argparse
import pandas as pd
parser = argparse.ArgumentParser()
parser.add_argument('file', help="this is the file you want to open")
parser.add_argument('save_file_name', help="the name of the file you want for the output CSV")
args = parser.parse_args()
print("file name:", args.file) # checking that this worked
...
# export csv
path = os.getcwd()
export_path = path + args.save_file_name + '.csv'
export_csv = df.to_csv(path_or_buf=export_path, header=False, index=False)
I think, that problem is in your export_path variable, that is not made right. The following code should do the job.
export_path = os.path.join(path, args.save_file_name + '.csv')

Rename a file/ remove numbers from a file name in python 3

My code runs well when I didn't ask to rename the file, just to print all file names without number, but when I did it and checked file instead of running it, it just doesn't work.
here's my code:
import os
import re
def rename_file():
file_list = os.listdir(r"C:\Users\Zoe.Zhao\Desktop\prank")
saved_path = os.getcwd()
os.chdir(r"C:\Users\Zoe.Zhao\Desktop\prank")
print (saved_path)
for file_name in file_list:
file_name = re.sub('[0-9]','',file_name)
print(file_name)
rename_file()
Here are some sreenshots:
before:
after:
You need to create a new list: see below:
import os
import re
def rename_file():
file_list = os.listdir(r"C:\Users\afareh\Dropbox\PROGRAMMING\test\prank\prank")
saved_path = os.getcwd()
os.chdir(r"C:\Users\afareh\Dropbox\PROGRAMMING\test\prank\prank")
new_file_list=[] # create an empty list for storing the names of the renamed files.
for file_name in file_list:
file_name=re.sub('[0-9]','',file_name)
new_file_list.append(file_name)
print (new_file_list)
rename_file()

Changing excel file to csv inside python script?

I'm trying to change a list of excel files to csvs before loading them into a pandas dataframe, but I'm unsure how I can convert them in my script. Csvkit and xlsx2csv seem to work for doing it from the command line, but when I try to start a subprocess like so
for filename in sorted_files:
file = subprocess.Popen("in2csv filename", stdout=subprocess.PIPE)
print file.stdout
dataframe = pd.read_csv(file)
I'm getting the error
IOError: Expected file path name or file-like object, got type
schema must not be null when format is "fixed"
Is it possible to get the output from the subprocess and pipe that to a dataframe? Any help greatly appreciated!
Although it has been so long since the question was made, I had the same issue and this is the way it was implemented inside a python script:
Could only execute Xlsx2csv with sheetid parameter. In order to get sheet names and ids, get_sheet_details was used.
csvfrmxlsx creates csv files for each sheet in csv folder under parent directory.
import pandas as pd
from pathlib import Path
def get_sheet_details(filename):
import xmltodict
import shutil
import zipfile
sheets = []
# Make a temporary directory with the file name
directory_to_extract_to = (filename.with_suffix(''))
directory_to_extract_to.mkdir(parents=True, exist_ok=True)
# Extract the xlsx file as it is just a zip file
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall(directory_to_extract_to)
zip_ref.close()
# Open the workbook.xml which is very light and only has meta data, get sheets from it
path_to_workbook = directory_to_extract_to / 'xl' / 'workbook.xml'
with open(path_to_workbook, 'r') as f:
xml = f.read()
dictionary = xmltodict.parse(xml)
for sheet in dictionary['workbook']['sheets']['sheet']:
sheet_details = {
'id': sheet['#sheetId'], # can be sheetId for some versions
'name': sheet['#name'] # can be name
}
sheets.append(sheet_details)
# Delete the extracted files directory
shutil.rmtree(directory_to_extract_to)
return sheets
def csvfrmxlsx(xlsxfl, df): # create csv files in csv folder on parent directory
from xlsx2csv import Xlsx2csv
(xlsxfl.parent / 'csv').mkdir(parents=True, exist_ok=True)
for index, row in df.iterrows():
shnum = row['id']
shnph = xlsxfl.parent / 'csv' / Path(row['name'] + '.csv') # path for converted csv file
Xlsx2csv(str(xlsxfl), outputencoding="utf-8").convert(str(shnph), sheetid=int(shnum))
return
pthfnc = 'c:/xlsx/'
wrkfl = 'my.xlsx'
xls_file = Path(pthfnc + wrkfl)
sheetsdic = get_sheet_details(xls_file) # dictionary with sheet names and ids without opening xlsx file
df = pd.DataFrame.from_dict(sheetsdic)
csvfrmxlsx(xls_file, df) # df with sheets to be converted

Resources