Refining Python code for repeated use (xlsx to txt file and to WordCloud) - python-3.x

I've been self-learning Python for the past month (zero coding experience; Python is my first language) and have finally written my first work-related usable code. I'm trying to refine this code for repeated use: it converts comment-based xlsx data to a txt ('string'-type) file and finally to a wordcloud. You can find the working code below.
How the code works:
Step 1. xlsx file = 4-column Excel worksheet
Step 2. Python extracts all of column 'B'
Step 3. converts it into 'str' format, removes spaces & writes it to a txt file
Step 4. wordcloud removes words using STOPWORDS
Step 5. generates the wordcloud according to the format
I would like to refine it so that:
the file directory can be changed in one simple step instead of copy-pasting the directory name in multiple places (skip manually changing all file paths)
the txt file's name is derived from the xlsx file's name (so I don't have to key it in manually every time)
If anyone has a better way of refining this, please let me know. I am very new to this, so if you need any other information to clarify anything, let me know.
Any help would be greatly appreciated. Thank you all in advance.
import openpyxl as xl
import wordcloud
from wordcloud import WordCloud, STOPWORDS
from matplotlib.pyplot import imread
import jieba
import pandas as pd

# opening the source excel file (repeated steps needed for every different document)
filename = "C:\\Users\\shakesmilk\\Desktop\\staub\\staub天猫商品评论.xlsx"
wb1 = xl.load_workbook(filename)
ws1 = wb1.worksheets[0]

# opening the destination excel file (repeated steps needed for every different document)
filename1 = "C:\\Users\\shakesmilk\\Desktop\\staub\\staub天猫商品评论.xlsx"
wb2 = xl.load_workbook(filename1)
wb2.create_sheet('Sheet2')
ws2 = wb2.worksheets[1]

# calculate total number of rows and columns in the source excel file
mr = ws1.max_row
mc = ws1.max_column
minr = ws2.min_row

# copying the cell values from the source sheet to the destination sheet
for i in range(1, mr + 1):
    for j in range(0, mc + 1):
        # reading cell value from source excel file
        c = ws1.cell(row=i + 1, column=2)
        # writing the read value to destination excel file
        ws2.cell(row=i + 1, column=2).value = c.value

# deleting first empty column
ws2.delete_cols(1)

# saving the destination excel file
wb2.save(filename1)

# converting sheet 2 with pandas to a txt file
df = pd.read_excel(filename, sheet_name=1)
with open("C:\\Users\\shakesmilk\\Desktop\\staub\\file.txt", mode='w', encoding='utf-8') as outfile:
    df.to_string(outfile, header=None, index=None)

# open, read & remove spaces from the txt file
commentfiletxt = "C:\\Users\\shakesmilk\\Desktop\\staub\\file.txt"
with open(commentfiletxt, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# remove spaces
lines = [line.replace(' ', '') for line in lines]

# finally, write the lines back to the file
with open(commentfiletxt, 'w', encoding='utf-8') as f:
    f.writelines(lines)

# txt file generated > next, create the wordcloud
# remove words from the wordcloud
stopwords = set(STOPWORDS)
stopwords.update(['此用户没有填写评论', 'hellip', 'zwj', '其他特色', '还没用', '非常喜欢', '产品功能', '没有用'])
mask = imread('moon.jpg')

with open(commentfiletxt, 'r', encoding='utf-8') as file:
    text = file.read()
words = jieba.lcut(text)   # precise-mode segmentation
newtxt = ' '.join(words)   # join with spaces

wd = wordcloud.WordCloud(stopwords=stopwords,
                         font_path="MSYH.TTC",
                         background_color="white",
                         width=800,
                         height=300,
                         max_words=500,
                         max_font_size=200,
                         mask=mask,
                         ).generate(text)

# save picture
wd.to_file('staub2.png')

I continued reading "Learning Python, 5th Edition", and apparently functions are a good way to make code reusable. I guess no one is interested in noob code, but I'm sure there are a lot of beginners out there trying to refine their code, so I'm answering my own question as I progress through the book; hopefully this helps those in need. P.S. I'm currently reading about classes, and I'm guessing I could transform this code further, but for now, for those in need, this is my 'def' example:
import openpyxl as xl
import wordcloud
from wordcloud import WordCloud, STOPWORDS
from matplotlib.pyplot import imread
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import jieba
import pandas as pd

def open_excel(filename):
    global wb1, ws1, filenametxt
    wb1 = xl.load_workbook(filename)
    ws1 = wb1.worksheets[0]
    filenametxt = filename
    print('Loading WorkBook Completed')

def create_sheet(filename1):
    global wb2, ws2
    wb2 = xl.load_workbook(filename1)
    wb2.create_sheet('Sheet2')
    ws2 = wb2.worksheets[1]
    print('Sheet 2 Created')
    mr = ws1.max_row
    mc = ws1.max_column
    minr = ws2.min_row
    for i in range(1, mr + 1):
        for j in range(0, mc + 1):
            # reading cell value from source excel file
            c = ws1.cell(row=i + 1, column=2)
            ws2.cell(row=i + 1, column=2).value = c.value
    wb2.save(filename1)
    print("Data Extracted To 'Column B'")
    ws2.delete_cols(1)
    print('Empty Space in Column 1 Deleted')
    wb2.save(filename1)

def create_txtf(tfile):
    global df
    df = pd.read_excel(filenametxt, sheet_name=1)
    with open(tfile, mode='w', encoding='utf-8') as outfile:
        df.to_string(outfile, header=None, index=None)
    print('txt file created as file.txt')

def convert_remove(tfile1):
    # open, read & remove spaces from the txt file
    global commentfiletxt
    commentfiletxt = tfile1
    with open(commentfiletxt, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # remove spaces
    lines = [line.replace(' ', '') for line in lines]
    print('Empty spaces are removed')
    # finally, write the lines back to the file
    with open(commentfiletxt, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print('Data is written correctly without spaces')
    return tfile1

def wordcloudpic(picname, maskpathn):
    stopwords = set(STOPWORDS)
    stopwords.update(['此用户没有填写评论', 'hellip', 'zwj', '其他特色', '还没用', '非常喜欢', '产品功能', '没有用',
                      '东西收到了', 'S', 'sode', 'c', 's左右', 'u', 'middot', 'u', 'theta', 'rdquo', 'ldquo',
                      'ec', 'ok', '好评', '不错', '很好', '满意', '好用', '老板大气', '好', 'nbsp'])
    mask = imread(maskpathn)
    mask = mask.astype(np.uint8)
    with open(commentfiletxt, 'r', encoding='utf-8') as file:
        text = file.read()
    words = jieba.lcut(text)   # precise-mode segmentation
    newtxt = ' '.join(words)   # join with spaces
    wd = wordcloud.WordCloud(stopwords=stopwords,
                             font_path="MSYH.TTC",
                             background_color="white",
                             width=800,
                             height=300,
                             max_words=500,
                             max_font_size=200,
                             mask=mask,
                             ).generate(text)
    # save picture
    wd.to_file(picname)

if __name__ == "__main__":
    open_excel("C:\\Users\\shakesmilk\\Desktop\\testtest\\test天猫商品评.xlsx")
    create_sheet("C:\\Users\\shakesmilk\\Desktop\\testtest\\test天猫商品评.xlsx")
    create_txtf("C:\\Users\\shakesmilk\\Desktop\\testtest\\file.txt")
    convert_remove("C:\\Users\\shakesmilk\\Desktop\\testtest\\file.txt")
    wordcloudpic('test.png', 'bubble.jpg')
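For the two refinements the question originally asked about (changing the directory in one step, and deriving the txt file's name from the xlsx file's name), pathlib can handle both. Here is a minimal sketch under the folder layout used above; the `base` variable and the derived names are illustrative, not part of the original code:
from pathlib import Path

# single place to change the working directory
base = Path(r"C:\Users\shakesmilk\Desktop\staub")
xlsx_path = base / "staub天猫商品评论.xlsx"

# derive the txt and png names from the xlsx name;
# .with_suffix() swaps only the extension and keeps the stem
txt_path = xlsx_path.with_suffix(".txt")
png_path = xlsx_path.with_suffix(".png")

# Path objects can be passed straight to open() and load_workbook()
print(txt_path.name)  # staub天猫商品评论.txt
Passing xlsx_path into open_excel() and computing tfile as xlsx_path.with_suffix('.txt') would leave `base` as the only hardcoded path.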

Related

How to convert 50000 txt files into a csv

I have many text files. I tried to convert the txt files into a single CSV file, but it is taking a huge amount of time. I set the code running at night and slept; by morning it had processed only 4500 files and was still running.
Is there any way to convert the text files into a CSV faster?
Here is my code:
import pandas as pd
import os
import glob
from tqdm import tqdm

# create empty dataframe
csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])

# get list of files
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

for filename in tqdm(file_list):
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in tqdm(datafile):
            # Note: partition results in 3 string parts: "key", " ", "value"
            # array slice third parameter [::2] means step=+2,
            # so only take the 1st and 3rd items
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)

# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
Here is my example text file.
ID 0xb379
Delivery_person_ID BANGRES18DEL02
Delivery_person_Age 34.000000
Delivery_person_Ratings 4.500000
Restaurant_latitude 12.913041
Restaurant_longitude 77.683237
Delivery_location_latitude 13.043041
Delivery_location_longitude 77.813237
Order_Date 25-03-2022
Time_Orderd 19:45
Time_Order_picked 19:50
Weather conditions Stormy
Road_traffic_density Jam
Vehicle_condition 2
Type_of_order Snack
Type_of_vehicle scooter
multiple_deliveries 1.000000
Festival No
City Metropolitian
Time_taken (min) 33.000000
CSV is a very simple data format; you don't need any sophisticated tools to handle it. It's just text and separators.
In your (hopefully simple) case there is no need for pandas and dictionaries.
The exception would be data files that are corrupt, missing some columns, or carrying extra columns to skip. But even then you can handle such issues better within your own code, where you have more control, and still get results within seconds.
Assuming your data files are not corrupt and have all columns in the right order, with none missing or added (so you can rely on their formatting), just try this code:
from time import perf_counter as T
sT = T()
filesProcessed = 0
columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]

import glob, os
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

csv_lines = []
csv_line_counter = 0
for filename in file_list:
    filesProcessed += 1
    with open(filename) as datafile:
        csv_line = ""
        for line in datafile.read().splitlines():
            # print(line)
            var = line.partition(" ")[-1]
            csv_line += var.strip() + ';'
        csv_lines.append(str(csv_line_counter) + ';' + csv_line[:-1])
        csv_line_counter += 1

with open("train.csv", "w") as csvfile:
    csvfile.write(';' + ';'.join(columns) + '\n')
    csvfile.write('\n'.join(csv_lines))

eT = T()
print(f'> {filesProcessed=}, {(eT-sT)=:8.6f}')
I guess you will get the result at a speed beyond your expectations (seconds, not minutes or hours).
On my computer, extrapolating from the processing time for 100 files, the time required for 50,000 files would be about 3 seconds.
I could not replicate this. I took the example data file and created 5000 copies of it. Then I ran your code with tqdm and without. Below is the version without:
import time
import csv
import os
import glob
import pandas as pd
from tqdm import tqdm

csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])
file_list = glob.glob(os.path.join(os.getcwd(), "sample_files/", "*.txt"))

t1 = time.time()
for filename in file_list:
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in datafile:
            # Note: partition results in 3 string parts: "key", " ", "value"
            # array slice third parameter [::2] means step=+2,
            # so only take the 1st and 3rd items
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)
# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
t2 = time.time()
print(t2-t1)
The times I got were:
tqdm: 33 seconds
no tqdm: 34 seconds
Then I ran using the csv module:
t1 = time.time()
with open('output.csv', 'a', newline='') as csv_file:
    columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]
    mydict = {}
    d_Writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=',')
    d_Writer.writeheader()
    for filename in file_list:
        with open(filename) as datafile:
            for line in datafile:
                name, var = line.partition(" ")[::2]
                mydict[name.strip()] = var.strip()
        d_Writer.writerow(mydict)
t2 = time.time()
print(t2-t1)
The time for this was:
csv 0.32231569290161133 seconds.
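The main cost in the original code is csvout.append(mydict, ignore_index=True): each call copies the whole accumulated DataFrame, so the loop is quadratic in the number of files (and DataFrame.append has since been removed in pandas 2.x). If you'd rather stay in pandas, here is a sketch of the usual fix, collecting plain dicts and constructing the frame once; paths and parsing follow the question:
import glob
import os
import pandas as pd

file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

records = []
for filename in file_list:
    mydict = {}
    with open(filename) as datafile:
        for line in datafile:
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    records.append(mydict)

# one DataFrame construction instead of one copy per file
csvout = pd.DataFrame.from_records(records)
csvout.to_csv("train.csv", sep=";", index=False)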
Try it like this.
import glob

with open('my_file.csv', 'a') as csv_file:
    for path in glob.glob('./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

In python, how to concatenate corresponding sheets in multiple excel files

How do I concatenate multiple xlsx files with the same sheet names? For example,
I have 3 xlsx files: Rob_schedule.xlsx, Mike_schedule.xlsx and Jerome_schedule.xlsx.
Each file has the following sheet/tab names: home, office & school.
The code below generates the 3 xlsx files (you can copy + paste and run it to generate the Excel files):
############################## Generating the data for Rob_schedule.xlsx ########################
import pandas as pd
import numpy as np

df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ6', 'MQ2', 'MQ8'],
    'Lunch': [1, 1, 1, 3],
    'code': ['java', 'python', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['C', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'White', 'color'],
    'cleaning': ['balcony', 'garage', 'restroom', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

import pandas as pd
# initialize the excel writer
writer = pd.ExcelWriter('Rob_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()

################################ Generating Mike_schedule.xlsx ###################################
import pandas as pd
import numpy as np

df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'R', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['A', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

# initialize the excel writer
writer = pd.ExcelWriter('Mike_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():  # use .items() for Python 3.x
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()

######################### Generating Jerome_schedule.xlsx ###########################################
df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['French', 'Math', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'python', 'R', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['X', 'B', 'D', 'C'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

import pandas as pd
# initialize the excel writer
writer = pd.ExcelWriter('Jerome_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():  # use .items() for Python 3.x
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()
I want to:
concatenate the corresponding sheets/tabs (home, office, and school) across Rob_schedule.xlsx, Mike_schedule.xlsx & Jerome_schedule.xlsx
export the concatenated dataframes as family_schedule.xlsx with home, office, and school tabs
My attempt:
# This code concatenates all the tabs into one tab, but what I want
# is to concatenate by corresponding sheet/tab name
import os
import pandas as pd

path = os.chdir(r'mypath\\')
files = os.listdir(path)
files

# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]
excel_files

def create_df_from_excel(file_name):
    file = pd.ExcelFile(file_name)
    names = file.sheet_names
    return pd.concat([file.parse(name) for name in names])

df = pd.concat(
    [create_df_from_excel(xl) for xl in excel_files]
)

# save the data frame
writer = pd.ExcelWriter('family_reschedule.xlsx')
df.to_excel(writer, '')
writer.save()
I would iterate over each file, and then over each worksheet, adding each sheet to a different list based on the sheet name.
Then you'll have a structure like...
{
    'sheet1': [df_file1_sheet1, df_file2_sheet1, df_file3_sheet1],
    'sheet2': [df_file1_sheet2, df_file2_sheet2, df_file3_sheet2],
    'sheet3': [df_file1_sheet3, df_file2_sheet3, df_file3_sheet3],
}
Then concatenate each list into a single dataframe, then write the three dataframes to an excel file.
# This part is just your own code; I've added it here because you
# couldn't figure out where `excel_files` came from
#################################################################
import os
import pandas as pd

path = os.chdir(r'mypath\\')
files = os.listdir(path)
files

# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]
excel_files

# This part is my actual answer
###############################
from collections import defaultdict

worksheet_lists = defaultdict(list)

for file_name in excel_files:
    workbook = pd.ExcelFile(file_name)
    for sheet_name in workbook.sheet_names:
        worksheet = workbook.parse(sheet_name)
        worksheet['source'] = file_name
        worksheet_lists[sheet_name].append(worksheet)

worksheets = {
    sheet_name: pd.concat(sheet_list)
    for (sheet_name, sheet_list)
    in worksheet_lists.items()
}

writer = pd.ExcelWriter('family_reschedule.xlsx')
for sheet_name, df in worksheets.items():
    df.to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
Consider building a list of concatenated data frames with list/dict comprehensions by running an outer iteration across sheet names and inner iteration across workbooks:
import os
import pandas as pd

path = "/path/to/workbooks"
workbooks = [f for f in os.listdir(path) if f.endswith(".xlsx")]
sheets = ["home", "office", "school"]

df_dicts = {
    sh: pd.concat(
        [pd.read_excel(os.path.join(path, wb), sheet_name=sh)
         for wb in workbooks]
    )
    for sh in sheets
}
Then, export to a single file:
with pd.ExcelWriter('family_reschedule.xlsx') as writer:
    for sh, df in df_dicts.items():
        df.to_excel(writer, sheet_name=sh, index=False)
# no explicit writer.save() needed; the context manager saves on exit

Import txt file and filter with space

I'm writing a script to track my orders from a website. I want to import the order numbers from a txt file, and the script should repeat as long as there are order numbers. I wrote code where the script imports this txt file and chooses a random order number, but the script puts all the order numbers together and doesn't separate them. How can I fix this?
This is my code:
import random

f = open("Order#.txt", "r")
OrderNR = f.read()
words = OrderNR.split()
Repeat = len(words)
for i in range(Repeat):
    randomlist = OrderNR
    Orderrandom = random.choice(randomlist)
    Mainlink = 'https://footlocker.narvar.com/footlocker/tracking/startrack?order_number=' + Orderrandom
Instead of using f.read(), try using f.readlines().
# Using readlines()
file1 = open('myfile.txt', 'r')
Lines = file1.readlines()
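For illustration, here is a sketch of how that fixes the original loop (file name and URL taken from the question; the print is just for demonstration). The underlying bug is that random.choice(OrderNR) picks a single random character from one big string, whereas choosing from the split list returns a whole order number:
import random

with open("Order#.txt", "r") as f:
    words = f.read().split()  # a list with one entry per order number

for _ in range(len(words)):
    Orderrandom = random.choice(words)  # a whole order number, not one character
    Mainlink = ('https://footlocker.narvar.com/footlocker/tracking/startrack'
                '?order_number=' + Orderrandom)
    print(Mainlink)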
Try pandas:
import pandas as pd
df = pd.read_csv('Order#.txt', delimiter='\t')
print(df)
You can then see the txt file in table format.
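Building on that, a sketch for picking a random order number straight from the DataFrame; this assumes the file holds one order number per line with no header row (adjust if the layout differs):
import pandas as pd

# assumption: one order number per line, no header row
df = pd.read_csv('Order#.txt', header=None, names=['order'])
Orderrandom = df['order'].sample(1).iloc[0]  # one random order number
print(Orderrandom)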

Storing output data in CSV using python

I have extracted data from different Excel sheets spread across different folders. I have organized the folders numerically from 2015 to 2019, and each folder has twelve subfolders (from 1 to 12). Here's my code:
import os
from os import walk
import pandas as pd

path = r'C:\Users\Sarah\Desktop\IOMTest'
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend([os.path.join(dirpath, fname) for fname in filenames])

all_sheets = []
for file_name in my_files:
    # Display sheet names using pandas
    pd.set_option('display.width', 300)
    mosul_file = file_name
    xl = pd.ExcelFile(mosul_file)
    mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
    # Read Excel and select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'],
                               usecols="A, E, G, H, L, M")
    # Remove NaN values
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
    data_mosul_df = mosul_file.dropna()
    print(data_mosul_df)
Then I saved the extracted columns in a csv file:
def save_frames(frames, output_path):
    for frame in frames:
        frame.to_csv(output_path, mode='a+', header=False)

if __name__ == '__main__':
    frames = [pd.DataFrame(data_mosul_df)]
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
My problem is that when I open the csv file, it seems it doesn't store all the data, only the last Excel sheet it has read, or sometimes the last two Excel sheets. However, when I print my data in the console (in Spyder), I see that all the data has been processed:
data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
data_mosul_df = mosul_file.dropna()
print(data_mosul_df)
The picture below shows the csv that was created. I am wondering if it is because the information in columns A to E is the same, and that's why it overwrites?
I would like to know how to modify the code so that it extracts and stores the data chronologically from the folders (2015 to 2019), taking into account the subfolders (1 to 12) in each folder, and how to create a csv that stores all the data. Thank you.
Rewrite your loop:
for file_name in my_files:
    # Display sheet names using pandas
    pd.set_option('display.width', 300)
    mosul_file = file_name
    xl = pd.ExcelFile(mosul_file)
    mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
    # Read Excel and select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'],
                               usecols="A, E, G, H, L, M")
    # Remove NaN values
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
    data_mosul_df = mosul_file.dropna()
    # Make a list of df's
    all_sheets.append(data_mosul_df)
Rewrite your save_frames:
def save_frames(frames, output_path):
    frames.to_csv(output_path, mode='a+', header=False)
Rewrite your main:
if __name__ == '__main__':
    frames = pd.concat(all_sheets)
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
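The question also asks for chronological processing of the folders (2015 to 2019) and subfolders (1 to 12). os.walk makes no ordering guarantee, so one option is to sort my_files before the loop by the year and month parsed from each path. A sketch, assuming the year\month\file layout described in the question (the sort_key helper is illustrative, not from the original code):
import os

def sort_key(path):
    # pull year and month from e.g. ...\2015\3\file.xlsx -> (2015, 3);
    # assumes the last two folder names are the year and the month
    parts = os.path.normpath(path).split(os.sep)
    return int(parts[-3]), int(parts[-2])

# my_files is the list built by the walk loop above
my_files.sort(key=sort_key)  # iterate 2015/1, 2015/2, ..., 2019/12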

Save CSV for every function call with another name

At the moment I am able to create one CSV file with all the content I get at once.
Now I would like to create a list with different names in it.
How can I produce a different CSV file name for every function call? I thought about looping over a list, but I just want a +1 iteration at each call. I thought about saving my state somehow and using it in the next function call. Every time, though, I initialize my variable with 0, so I never get to 1. I think I could do it with Python function parameters, but I have no idea how to use them. Can someone give me a little tip or example? If there are better ideas (maybe mine is completely wrong), please help.
The comments in the code represent what I have in mind.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tenable.sc import SecurityCenter as SC
import os.path
import sys
import getpass
import csv

SC_HOST = '...'

def parse_entry(entry):
    split_after_path = ''
    ip = entry.get('ip', None)
    pluginText = entry.get('pluginText', None)
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                split_after_path_in_plugintext = line.split("Path : ", 1)[1]

    # place = ['place1', 'place2', 'place3', 'place4', 'place5']
    # i = 0
    # i = i+1
    file_exists = os.path.isfile('testfile_path.csv')
    # file_exists = os.path.isfile('testfile_path_'+place[i]+'.csv')
    data = open('testfile_path.csv', 'a')
    # data = open('testfile_path_'+place[i]+'.csv', 'a')
    with data as csvfile:
        header = ['IP Address', 'Path']
        writer = csv.DictWriter(csvfile, lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, fieldnames=header)
        if not file_exists:
            writer.writeheader()
        writer.writerow({'IP Address': ip, 'Path': split_after_path})
    data.close()

def main():
    sc_user = input('[<<] username: ')
    sc_pass = getpass.getpass('[<<] password: ')
    sc = SC(SC_HOST)
    sc.login(sc_user, sc_pass)
    # Query API for data
    # asset = [12,13,14,25,29]
    # i = 0
    # assetid = asset[i]
    # vuln = sc.analysis.vulns(('pluginID', '=', '25072')('asset','=','assetid'))
    # i = i+1
    vuln = sc.analysis.vulns(('pluginID', '=', '25072'), ('asset', '=', '11'))
    for entry in vuln:
        parse_entry(entry)
    sc.logout()
    return 0

if __name__ == '__main__':
    sys.exit(main())
The simplest and most obvious solution is to pass the full file path to your parse_entry function, i.e.:
def parse_entry(entry, filepath):
    # ...
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                # ...
    file_exists = os.path.isfile(filepath)
    with open(filepath, 'a') as csvfile:
        # ...
Then in main() use enumerate() to build sequential filenames:
def main():
    # ...
    for i, entry in enumerate(vuln):
        path = "testfile_path{}.csv".format(i)
        parse_entry(entry, path)
You can use a function attribute to keep track of the number of times the function has been called:
def parse_entry(entry):
    parse_entry.i += 1

# outside the function you have to initialize the attribute
parse_entry.i = 0
Or you can look at other ways to initialize the function attribute in this post.
Alternatively, you can use glob to get the current number of files.
from glob import glob
i = len(glob('testfile_path_*.csv'))
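Tying these together, here is a minimal, self-contained sketch of the function-attribute approach producing a fresh file per call; the testfile_path_ pattern follows the commented-out code in the question, and the sample entries are made up for demonstration:
import csv

def parse_entry(entry):
    parse_entry.i += 1  # bump the per-call counter
    filepath = 'testfile_path_{}.csv'.format(parse_entry.i)
    with open(filepath, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['IP Address', 'Path'])
        writer.writeheader()
        writer.writerow({'IP Address': entry.get('ip'),
                         'Path': entry.get('pluginText')})

parse_entry.i = 0  # initialize the attribute once, outside the function

# each call writes to a new file: testfile_path_1.csv, testfile_path_2.csv, ...
parse_entry({'ip': '10.0.0.1', 'pluginText': 'Path : C:\\tmp'})
parse_entry({'ip': '10.0.0.2', 'pluginText': 'Path : D:\\data'})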
