Dynamically insert file name and path for saving csv in pandas - python-3.x

I need to dynamically save my pandas data frame. I have successfully managed to output a CSV file with a static name using the following code:
export_csv = df.to_csv(r'static_name.csv', header=False , index=False)
But I have failed to make this work dynamically. With the code below, I expect to get a file with the name passed into args.save_file_name and .csv suffix. However, I get no result.
import os
import argparse
import pandas as pd
# Two positional command-line arguments: the input file and the base name of the output CSV.
parser = argparse.ArgumentParser()
parser.add_argument('file', help="this is the file you want to open")
parser.add_argument('save_file_name', help="the name of the file you want for the output CSV")
args = parser.parse_args()
print("file name:", args.file) # checking that this worked
...
# export csv
path = os.getcwd()
# NOTE(review): the pieces are concatenated with no path separator between `path` and the
# file name; os.getcwd() normally has no trailing separator, so the name is fused onto the
# last directory component instead of landing inside the current directory.
export_path = path + args.save_file_name + '.csv'
# `df` is presumably built in the elided part of the script (the `...` above).
export_csv = df.to_csv(path_or_buf=export_path, header=False, index=False)

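I think the problem is in how export_path is built: os.getcwd() returns the directory without a trailing path separator, so plain string concatenation glues the file name onto the last directory name. Building the path with os.path.join should do the job.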
export_path = os.path.join(path, args.save_file_name + '.csv')

Related

Loop over files

I have a series of files named file_0001.csv, file_0002.csv, ... file_1000.csv etc. I need to read them iteratively by creating a list of the filenames and as
import numpy as np
import pandas as pd
# `files` is the list of file names the question asks how to build; it is not defined in this snippet.
for fileName in files:
# NOTE(review): "folder" and the file name are joined with no path separator - confirm that is intended.
data = pd.read_csv("folder" + fileName)
# Replace the DataFrame with its underlying array of values.
data = data.values
How do I create the list of file names by checking the first and last file.
Thank you for your help
If you are trying to create those specific file names, you can loop from 1 to 1000 and build each name:
import numpy as np
import pandas as pd
# Generate the zero-padded names file_0001.csv ... file_1000.csv in order.
for i in range(1, 1001):
    # {:04d} left-pads the number with zeros to four digits.
    filename = "file_{:04d}.csv".format(i)
    # data = pd.read_csv("folder/" + filename)
    # data = data.values
    print(filename)
output last 10 lines:
file_0991.csv
file_0992.csv
file_0993.csv
file_0994.csv
file_0995.csv
file_0996.csv
file_0997.csv
file_0998.csv
file_0999.csv
file_1000.csv
You should use pathlib from the standard library. Then you can simply do
import pandas as pd
from pathlib import Path
# get file number assuming the name format "file_number.csv"
def get_file_number(file_path):
    """Return the number embedded in a file name shaped like "file_<number>.csv".

    Args:
        file_path: a ``pathlib.Path`` (anything exposing ``.stem``).

    Returns:
        The text between the first and second underscore of the stem,
        converted to ``int`` (leading zeros are dropped).

    Raises:
        IndexError: the stem contains no underscore.
        ValueError: the part after the first underscore is not an integer.
    """
    return int(file_path.stem.split("_")[1])
folder_path = Path("path/to/folder")
# sort files by file number
files = sorted(folder_path.glob("file_*.csv"), key=get_file_number)
# Read every matching CSV in ascending numeric order.
for file in files:
    print(file.name)  # just to check
    data = pd.read_csv(file)
    # do something with data

pd.read_excel function returns empty dataframe for xlsx sheet

Below script is there to convert all xlsx files placed inside the folder to CSV files with the same name.
import os
import glob
import pandas as pd
#set working directory
os.chdir("C:/Users/piyush.upadhyay/Piyush/Tasks/Task-61(Script to convert excel to csv)/Files")
# Every .xlsx file directly inside the working directory.
all_files = glob.glob('*.xlsx')
print(all_files)
for filename in all_files:
    # Best effort: a failure on one workbook is reported and the loop moves on.
    try:
        print(filename)
        input('Going to read xlsx to csv')
        # splitext removes only the final extension, so a name such as
        # "report.v2.xlsx" becomes "report.v2.csv" rather than "report.csv".
        outFileName = os.path.splitext(filename)[0] + '.csv'
        data_xls = pd.read_excel(filename, engine='openpyxl')
        print(data_xls)
        input('Going to convert xlsx to csv')
        data_xls.to_csv(outFileName, header=True, index=False)
        input('Converted')
    except Exception as e:
        print("Error Logged..")
        print(e)
input('Enter to exit')
Code returns a warning at the time of reading xlsx file which says:
Issue: Above function returns an empty data frame when we print the data_xls variable. As soon as we save the same file manually with the same extension i.e. xlsx, the code successfully converts all excel files inside the folder into CSV files.
Issue is same as the link describes

Read all CSVs, run some code on those CSVs and then write new CSVs

I have a CSV file and I am running some Python to remove line breaks from the CSV.
import csv
# newline='' on the reader switches off newline translation; in the default
# mode every '\r' is already turned into '\n' before the code sees it, so the
# replace() below would never find anything to remove.
with open('Jan2020.csv', 'r', newline='') as txtReader, \
        open('new_jan2020.csv', 'w') as txtWriter:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in txtReader:
        txtWriter.write(line.replace('\r', ''))
This works fine.
What I want to achieve is the following:
I have multiple CSV files in a folder: jan2020, feb2020, march2020, april2020, may2020
How would I loop through each file, remove line breaks like my above method and then output a new file for each where the name of the new file is the format: new_monthYear.csv?
So I would end up with a bunch of CSVs new_jan2020, new_feb2020, new_march2020, new_april2020, new_may2020
Thanks
You can list all files in a directory with os.listdir. I would also recommend to write your new files in another folder.
import os
import csv
# Folder name
path_input = '/path/folder/'
path_output = '/path/other/'
dirs = os.listdir(path_input)
# This would iterate over all the listed files
for file in dirs:
    file_to_read = os.path.join(path_input, file)
    file_to_write = os.path.join(path_output, 'new_' + file)
    # newline='' on the reader switches off newline translation; in the default
    # mode every '\r' is already turned into '\n' before the code sees it, so
    # the replace() below would never find anything to remove.
    with open(file_to_read, 'r', newline='') as txtReader, \
            open(file_to_write, 'w') as txtWriter:
        for line in txtReader:
            txtWriter.write(line.replace('\r', ''))
Updated with the use of pandas:
import os
import csv
import pandas as pd
# Folder name
path_input = r'your/path'
path_output = r'your/path'
dirs = os.listdir(path_input)
# Iterate over all the listed files
for file in dirs:
    file_to_read = os.path.join(path_input, file)
    file_to_write = os.path.join(path_output, 'new_' + file)
    # Remove line breaks
    df = pd.read_csv(file_to_read)
    # regex=True makes the pattern match inside cell text, not only whole cells.
    df2 = df.replace("\n", "", regex=True)
    # to_csv returns None when given a path, so the result is not kept.
    df2.to_csv(file_to_write, index=False)

Rename a file/ remove numbers from a file name in python 3

My code runs well when I didn't ask to rename the file, just to print all file names without number, but when I did it and checked file instead of running it, it just doesn't work.
here's my code:
import os
import re
# Prints the names found in the "prank" folder with every digit removed.
# NOTE(review): nothing here calls os.rename, so the files on disk keep their original names.
def rename_file():
file_list = os.listdir(r"C:\Users\Zoe.Zhao\Desktop\prank")
# Remember the starting directory (it is printed below but never restored).
saved_path = os.getcwd()
os.chdir(r"C:\Users\Zoe.Zhao\Desktop\prank")
print (saved_path)
for file_name in file_list:
# Only the loop variable is rebound to the digit-free name.
file_name = re.sub('[0-9]','',file_name)
print(file_name)
rename_file()
Here are some screenshots:
before:
after:
You need to create a new list: see below:
import os
import re
def rename_file(directory=r"C:\Users\afareh\Dropbox\PROGRAMMING\test\prank\prank"):
    """Remove every digit from the names of the entries in *directory*.

    Each entry whose name contains digits is renamed on disk with
    ``os.rename``. An entry is left untouched when its name has no digits,
    when stripping the digits would leave an empty name, or when the
    digit-free name is already taken (so nothing is overwritten).

    Full paths are used, so the current working directory is not changed.

    Args:
        directory: folder whose entries are renamed; defaults to the
            folder used in the original snippet.

    Returns:
        The list of entry names as they are after the call; it is also
        printed.
    """
    new_file_list = []  # names of the entries after renaming
    for file_name in os.listdir(directory):
        new_name = re.sub('[0-9]', '', file_name)
        if new_name != file_name:
            target = os.path.join(directory, new_name)
            if not new_name or os.path.exists(target):
                # Renaming would produce an empty name or clobber another entry.
                new_name = file_name
            else:
                os.rename(os.path.join(directory, file_name), target)
        new_file_list.append(new_name)
    print(new_file_list)
    return new_file_list
rename_file()

Changing excel file to csv inside python script?

I'm trying to change a list of excel files to csvs before loading them into a pandas dataframe, but I'm unsure how I can convert them in my script. Csvkit and xlsx2csv seem to work for doing it from the command line, but when I try to start a subprocess like so
for filename in sorted_files:
# NOTE(review): the command is the literal text "in2csv filename"; the loop variable is never substituted into it.
file = subprocess.Popen("in2csv filename", stdout=subprocess.PIPE)
print file.stdout
# NOTE(review): `file` is the Popen object itself, not a path or the process's output stream.
dataframe = pd.read_csv(file)
I'm getting the error
IOError: Expected file path name or file-like object, got type
schema must not be null when format is "fixed"
Is it possible to get the output from the subprocess and pipe that to a dataframe? Any help greatly appreciated!
Although it has been so long since the question was made, I had the same issue and this is the way it was implemented inside a python script:
I could only get Xlsx2csv to run when passing the sheetid parameter, so get_sheet_details is used to obtain the sheet names and ids.
csvfrmxlsx creates csv files for each sheet in csv folder under parent directory.
import pandas as pd
from pathlib import Path
def get_sheet_details(filename):
    """Return the id and name of every sheet in an .xlsx workbook.

    An .xlsx file is a zip archive whose sheet list lives in the small
    ``xl/workbook.xml`` member. That member is read straight from the
    archive, so nothing is extracted to disk and no directory next to the
    workbook is created or deleted.

    Args:
        filename: path (``str`` or ``pathlib.Path``) of the .xlsx file.

    Returns:
        A list of ``{'id': <sheetId>, 'name': <sheet name>}`` dicts in
        workbook order; both values are strings. A workbook with a single
        sheet yields a one-element list.

    Raises:
        KeyError: the archive has no ``xl/workbook.xml`` member.
        zipfile.BadZipFile: *filename* is not a zip archive.
    """
    import xml.etree.ElementTree as ET
    import zipfile

    # workbook.xml is very light and only holds metadata.
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        root = ET.fromstring(zip_ref.read('xl/workbook.xml'))

    def local_name(element):
        # Tags are namespace-qualified ("{uri}sheet"); keep the part after "}".
        return element.tag.rpartition('}')[2]

    sheets = []
    for container in root:
        if local_name(container) != 'sheets':
            continue
        for sheet in container:
            if local_name(sheet) == 'sheet':
                sheets.append({
                    'id': sheet.get('sheetId'),
                    'name': sheet.get('name'),
                })
    return sheets
def csvfrmxlsx(xlsxfl, df):  # create csv files in csv folder on parent directory
    """Write one CSV per row of *df* into a ``csv`` folder beside the workbook.

    Args:
        xlsxfl: ``pathlib.Path`` of the .xlsx workbook.
        df: DataFrame with an ``id`` column (sheet id, convertible to int)
            and a ``name`` column (used as the CSV file name).
    """
    from xlsx2csv import Xlsx2csv

    # The output folder is the same for every sheet, so build it once.
    csv_dir = xlsxfl.parent / 'csv'
    csv_dir.mkdir(parents=True, exist_ok=True)
    for _, row in df.iterrows():
        shnph = csv_dir / (row['name'] + '.csv')  # path for converted csv file
        Xlsx2csv(str(xlsxfl), outputencoding="utf-8").convert(str(shnph), sheetid=int(row['id']))
# Folder and file name of the workbook to convert.
pthfnc = 'c:/xlsx/'
wrkfl = 'my.xlsx'
xls_file = Path(pthfnc) / wrkfl
# Sheet names and ids, obtained without opening the workbook itself.
sheetsdic = get_sheet_details(xls_file)
# One row per sheet; this frame lists the sheets to be converted.
df = pd.DataFrame.from_dict(sheetsdic)
csvfrmxlsx(xls_file, df)

Resources