How to convert epoch time to a string? - python-3.x

I have a 10000*3 data array that I need to save as a CSV file. Below is my pseudo code, but I don't know how to achieve it with numpy only (not pandas). Does anyone know how to do this?
import numpy as np
import time
arr = np.random.randn(10000, 3)
# I need arr[0, 0] = time.time() and, in general, arr[i, 0] = time.time() + i:
arr[:, 0] = time.time() + np.arange(arr.shape[0])
# After that, column 0 has to become a datetime string (like "2021-02-12 12:12:12"),
# i.e. each item needs time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item)) applied to it.
# The final data types are column 1: str, column 2: float, column 3: float,
# and the result has to be saved as a CSV file.

This can be done with the csv module, which ships with Python's standard library (no extra install needed). See this tutorial for examples: https://www.pythontutorial.net/python-basics/python-write-csv-file/
import csv

header = ['datetime', 'col2', 'col3']  # placeholder column names

with open('countries.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)
    # write the data
    for row in arr:
        writer.writerow(row)
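Putting this together with the strftime conversion from the question gives a minimal end-to-end sketch (the header names and output filename are placeholders):

import csv
import time
import numpy as np

arr = np.random.randn(10000, 3)
arr[:, 0] = time.time() + np.arange(arr.shape[0])

with open('out.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(['datetime', 'col2', 'col3'])  # placeholder header
    for row in arr:
        # format the epoch seconds in column 0 as a local-time string
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(row[0]))
        writer.writerow([ts, row[1], row[2]])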

Related

How to convert 50,000 txt files into a CSV

I have many text files. I tried to convert them into a single CSV file, but it takes a huge amount of time. I started the code at night and went to sleep; by morning it had processed only 4500 files and was still running.
Is there a faster way to convert the text files into a CSV?
Here is my code:
import pandas as pd
import os
import glob
from tqdm import tqdm

# create empty dataframe
csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])

# get list of files
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

for filename in tqdm(file_list):
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on the first " " space
        for line in tqdm(datafile):
            # Note: partition returns 3 string parts: "key", " ", "value";
            # the slice's third parameter [::2] means step=+2,
            # so only the 1st and 3rd items are taken
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)

# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
Here is my example text file.
ID 0xb379
Delivery_person_ID BANGRES18DEL02
Delivery_person_Age 34.000000
Delivery_person_Ratings 4.500000
Restaurant_latitude 12.913041
Restaurant_longitude 77.683237
Delivery_location_latitude 13.043041
Delivery_location_longitude 77.813237
Order_Date 25-03-2022
Time_Orderd 19:45
Time_Order_picked 19:50
Weather conditions Stormy
Road_traffic_density Jam
Vehicle_condition 2
Type_of_order Snack
Type_of_vehicle scooter
multiple_deliveries 1.000000
Festival No
City Metropolitian
Time_taken (min) 33.000000
CSV is a very simple data format that needs no sophisticated tools to handle: just text and separators.
In your (hopefully simple) case there is no need for pandas and dictionaries.
The exception is if your data files are corrupt, missing some columns or containing additional ones to skip. But even in that case you can handle such issues better within your own code, so you have more control over it and can get results within seconds.
Assuming your data files are not corrupt and have all columns in the right order, with none missing and none added (so you can rely on their proper formatting), just try this code:
from time import perf_counter as T
sT = T()
filesProcessed = 0
columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]

import glob, os
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

csv_lines = []
csv_line_counter = 0
for filename in file_list:
    filesProcessed += 1
    with open(filename) as datafile:
        csv_line = ""
        for line in datafile.read().splitlines():
            # print(line)
            var = line.partition(" ")[-1]
            csv_line += var.strip() + ';'
    csv_lines.append(str(csv_line_counter) + ';' + csv_line[:-1])
    csv_line_counter += 1

with open("train.csv", "w") as csvfile:
    csvfile.write(';' + ';'.join(columns) + '\n')
    csvfile.write('\n'.join(csv_lines))

eT = T()
print(f'> {filesProcessed=}, {(eT-sT)=:8.6f}')
I guess you will get the result at a speed beyond your expectations (in seconds, not minutes or hours).
On my computer, extrapolating from the processing time for 100 files, 50,000 files should take about 3 seconds.
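For the corrupt-file case mentioned above, a minimal sketch of one way to stay in control (reusing the columns list from the code above; the filename in the usage lines is illustrative). It matches each line against the known column names instead of splitting on the first space, since some keys, like "Weather conditions" and "Time_taken (min)", contain spaces themselves:

def parse_file(path, columns):
    # start every column as empty so a missing line leaves a blank field
    record = dict.fromkeys(columns, "")
    with open(path) as f:
        for line in f:
            line = line.strip()
            # try longer column names first so a short name cannot shadow a longer one
            for col in sorted(columns, key=len, reverse=True):
                if line.startswith(col):
                    record[col] = line[len(col):].strip()
                    break
    return record

# usage: one semicolon-separated row per file; unknown lines are ignored
row = parse_file("train/0xb379.txt", columns)
print(';'.join(row[col] for col in columns))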
I could not replicate that. I took the example data file and created 5000 copies of it. Then I ran your code with tqdm and without. Below is the version without:
import time
import csv
import os
import glob
import pandas as pd
from tqdm import tqdm

csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])
file_list = glob.glob(os.path.join(os.getcwd(), "sample_files/", "*.txt"))

t1 = time.time()
for filename in file_list:
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in datafile:
            # Note: partition returns 3 string parts: "key", " ", "value";
            # the slice [::2] takes only the 1st and 3rd items
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)

# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
t2 = time.time()
print(t2 - t1)
The times I got were:
tqdm: 33 seconds
no tqdm: 34 seconds
Then I ran it using the csv module:
t1 = time.time()
with open('output.csv', 'a', newline='') as csv_file:
    columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]
    mydict = {}
    d_Writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=',')
    d_Writer.writeheader()
    for filename in file_list:
        with open(filename) as datafile:
            for line in datafile:
                name, var = line.partition(" ")[::2]
                mydict[name.strip()] = var.strip()
        d_Writer.writerow(mydict)
t2 = time.time()
print(t2 - t1)
The time for this was:
csv 0.32231569290161133 seconds.
Try it like this (note that it simply appends the raw text of each file; it does not split the key/value lines into separate CSV columns):

import glob

with open('my_file.csv', 'a') as csv_file:
    for path in glob.glob('./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

csv_reader: read N lines at a time

I have to read a CSV file N lines at a time.
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
    print(row)
I know I can loop N times, build a list of lists, and process it that way.
But is there a simpler way of using csv_reader so that I read N lines at a time?
Hi, I don't think you'll be able to do that without a loop using the csv package.
You should use pandas (pip install --user pandas) instead:

import pandas

df = pandas.read_csv('myfile.csv')
start = 0
step = 2  # your 'N'
for i in range(0, len(df), step):
    print(df[i:i+step])
    start = i
Pandas has a chunksize option to its read_csv() method, and I would probably explore that option.
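A minimal sketch of that approach (the filename and chunk size are placeholders):

import pandas as pd

# chunksize makes read_csv return an iterator of DataFrames,
# each holding at most that many rows
for chunk in pd.read_csv("data.csv", chunksize=5):
    print(chunk)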
If I were going to do it myself by hand, I would probably do something like:

import csv

def process_batch(rows):
    print(rows)

def get_batch(reader, batch_size):
    return [row for _ in range(batch_size) if (row := next(reader, None))]

with open("data.csv", "r") as file_in:
    reader = csv.reader(file_in)
    while batch := get_batch(reader, 5):
        process_batch(batch)
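An equivalent get_batch using itertools.islice from the standard library:

from itertools import islice

def get_batch(reader, batch_size):
    # islice consumes up to batch_size rows and stops early at end of file
    return list(islice(reader, batch_size))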

Import a txt file and split it on spaces

I'm writing a script to track my orders from a website. I want to import the order numbers from a txt file, and the script should repeat itself as long as there are order numbers. I wrote code where the script imports this txt file and chooses a random order number, but the script puts all the order numbers together and doesn't separate them. How can I fix this?
This is my code:
import random

f = open("Order#.txt", "r")
OrderNR = f.read()
words = OrderNR.split()
Repeat = len(words)
for i in range(Repeat):
    randomlist = OrderNR
    Orderrandom = random.choice(randomlist)
    Mainlink = 'https://footlocker.narvar.com/footlocker/tracking/startrack?order_number=' + Orderrandom
Instead of using f.read(), try using f.readlines().
# Using readlines()
file1 = open('myfile.txt', 'r')
Lines = file1.readlines()
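Applied to the order-number file, a sketch of the fix (keeping the question's file name and URL):

import random

with open('Order#.txt', 'r') as f:
    order_numbers = [line.strip() for line in f.readlines()]

# random.choice now picks a whole order number instead of a single character
order_random = random.choice(order_numbers)
main_link = 'https://footlocker.narvar.com/footlocker/tracking/startrack?order_number=' + order_random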
Try pandas:

import pandas as pd

df = pd.read_csv('Order#.txt', delimiter='\t')
print(df)

This lets you see the txt file in table format.

What is wrong with this pandas and txt file code?

I'm using pandas to open a CSV file that contains data from Spotify. Meanwhile, I have a txt file that contains various artist names from that CSV file. What I'm trying to do is take the value from each row of the txt file and automatically search for it with the function I've written.
import pandas as pd
import time

df = pd.read_csv("data.csv")
df = df[['artists', 'name', 'year']]

def buscarA():
    start = time.time()
    newdf = (df.loc[df['artists'].str.contains(art)])
    stop = time.time()
    tempo = (stop - start)
    print(newdf)
    e = ('{:.2f}'.format(tempo))
    print(e)

with open("teste3.txt", "r") as f:
    for row in f:
        art = row
        buscarA()
but the output is always the same:
Empty DataFrame
Columns: [artists, name, year]
Index: []
The problem here is that when you read the lines of your file in Python, each row also keeps its trailing line break, so you have to strip it off.
Let's suppose the first line of your teste3.txt file is "James Brown". It'd be read as "James Brown\n" and not recognized in the search.
Changing the last chunk of your code to:
with open("teste3.txt", "r") as f:
for row in f:
art = row.strip()
buscarA()
should work.

Get Python to add serial numbers to each entry as it runs

I am new to programming, and there is probably an answer to my question somewhere like here, the closest I found after searching for days. Most of the info deals with existing CSVs or hardcoded data. I am trying to make the program create data every time it runs and work on that, so I'm a little stumped here.
The problem:
I can't seem to get Python to attach serial numbers to each entry when I run the program I am making to log my study blocks. It has various fields; the following are two of them:
Date Time
12-03-2018 11:30
Following is the code snippet:
d = ''
while d == '':
    d = input('Date:')
    try:
        valid_date = dt.strptime(d, '%Y-%m-%d')
    except ValueError:
        d = ''
        print('Please input date in YYYY-MM-DD format.')

t = ''
while t == '':
    t = input('Time:')
    try:
        valid_time = dt.strptime(t, '%H:%M')
    except ValueError:
        d = ''
        print('Please input time in HH:MM format.')

header = csv.DictWriter(outfile, fieldnames=['UID', 'Date', 'Time', 'Topic', 'Objective', 'Why', 'Summary'], delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
header.writeheader()
log_input = csv.writer(outfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
log_input.writerow([d, t, topic, objective, why, summary])
outfile.close()

df = pd.read_csv('E:\Coursera\HSU\python\pom_blocks_log.csv')
df = pd.read_csv('E:\pom_blocks_log.csv')
df = df.reset_index()
df.columns[0] = 'UID'
df['UID'] = df.index
print(df)
I get the following error when I run the program with the df block:
TypeError: Index does not support mutable operations
I'm new to Python and don't really know how to work with data structures, so I am building small programs to learn. Any help is highly appreciated, and apologies if this is a duplicate; please point me in the right direction.
So, I figured it out. This is the process I followed:
I save the CSV file using the csv module.
I load the CSV file in pandas as a dataframe.
What this does is allow me to append user entries to the CSV every time the program is run; I can then load it as a dataframe and use pandas to manipulate the data accordingly. Then I added a generator to clean the lines of the delimiter character ',' so that the file can be loaded as a dataframe even though ',' is accepted as valid input in the string columns. Maybe this is a roundabout approach, but it works.
Following is the code:
import csv
from csv import reader
from datetime import datetime
import pandas as pd
import numpy as np

with open(r'E:\Coursera\HSU\08_programming\trLog_df.csv', 'a', encoding='utf-8') as csvfile:
    # Date
    d = ''  # input("Date:")
    while d == '':
        d = input('Date: ')
        try:
            valid_date = datetime.strptime(d, '%Y-%m-%d')
        except ValueError:
            d = ''
            print("Incorrect data format, should be YYYY-MM-DD")
    # Time
    t = ''  # input("Date:")
    while t == '':
        t = input('Time: ')
        try:
            valid_date = datetime.strptime(t, '%H:%M')
        except ValueError:
            t = ''
            print("Incorrect data format, should be HH:MM")
    log_input = csv.writer(csvfile, delimiter=',',
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    log_input.writerow([d, t])

# Function to clean lines off the delimiter ','
def merge_last(file_name, merge_after_col=7, skip_lines=0):
    with open(file_name, 'r') as fp:
        for i, line in enumerate(fp):
            if i < 2:
                continue
            spl = line.strip().split(',')
            yield (*spl[:merge_after_col], ','.join(spl[merge_after_col:2]))

# Generator to clean the lines
gen = merge_last(r'E:\Coursera\HSU\08_programming\trLog_df.csv', 1)
# get the column names
header = next(gen)
# create the data frame
df = pd.DataFrame(gen, columns=header)
df.head()
print(df)
If anybody has a better solution, it would be enlightening to know how to do it with efficiency and elegance.
Thank you for reading.
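For what it's worth, the TypeError above comes from assigning to df.columns[0]: a pandas Index is immutable. A shorter sketch, assuming the log CSV already exists (path and quotechar reused from the code above), adds the serial numbers with df.insert instead:

import pandas as pd

df = pd.read_csv(r'E:\Coursera\HSU\08_programming\trLog_df.csv', quotechar='|')
# insert a serial-number column at position 0 instead of mutating df.columns
df.insert(0, 'UID', range(len(df)))
print(df)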
