get multiple columns into text files - python-3.x

I have a CSV file and I want to split it into text files based on the first column, which holds the ids. Each output file should then contain the remaining columns. For example:
file.csv
id val1 val2 val3
1 50 52 60
2 45 84 96
and so on.
Here is my code:
dir_name = '/Users/user/My Documents/test/'
with io.open('file1.csv', 'rt', encoding='utf8') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    xx = []
    for row in reader:
        with open(os.path.join(dir_name, row[0] + ".txt"), 'a') as f2:
            xx = row[1:2]
            f2.write(xx + "\n")
So the output should be:
1.txt
50 52 60
2.txt
45 84 96
But it only creates the files, without content.
Can anyone help me? Thanks in advance.

There were a couple of issues:
It's actually a whitespace-separated values file, not a comma-separated one, so you have to change the delimiter from ','. Also, the whitespace is repeated, so you can pass an additional flag (skipinitialspace=True) to the csv module.
There was also some funkiness with the array indexing and the conversion to string.
This program meets your requirements:
#!/usr/bin/python
import io
import csv
import os

dir_name = './'
with io.open('input.csv', 'rt', encoding='utf8') as f:
    reader = csv.reader(f, skipinitialspace=True, delimiter=' ')
    next(reader)
    for row in reader:
        filename = os.path.join(dir_name, row[0])
        with open(filename + ".txt", 'a') as f2:
            xx = row[1:]
            f2.write(" ".join(xx) + "\n")

Related

How to convert 50000 txt files into a csv

I have many text files. I tried to convert them into a single CSV file, but it takes a huge amount of time. I left the code running overnight while I slept; it had processed only 4500 files and by morning it was still running.
Is there a faster way to convert the text files into a csv?
Here is my code:
import pandas as pd
import os
import glob
from tqdm import tqdm

# create empty dataframe
csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])
# get list of files
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

for filename in tqdm(file_list):
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in tqdm(datafile):
            # Note: partition results in 3 string parts: "key", " ", "value";
            # the slice [::2] (step 2) keeps only the 1st and 3rd items
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)

# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
Here is my example text file.
ID 0xb379
Delivery_person_ID BANGRES18DEL02
Delivery_person_Age 34.000000
Delivery_person_Ratings 4.500000
Restaurant_latitude 12.913041
Restaurant_longitude 77.683237
Delivery_location_latitude 13.043041
Delivery_location_longitude 77.813237
Order_Date 25-03-2022
Time_Orderd 19:45
Time_Order_picked 19:50
Weather conditions Stormy
Road_traffic_density Jam
Vehicle_condition 2
Type_of_order Snack
Type_of_vehicle scooter
multiple_deliveries 1.000000
Festival No
City Metropolitian
Time_taken (min) 33.000000
CSV is a very simple data format for which you don't need any sophisticated tools. Just text and separators.
In your hopefully simple case there is no need for pandas and dictionaries.
The exception is if your data files are corrupt, missing some columns or containing additional ones to skip. But even in that case you can handle such issues better within your own code, so you have more control over it and are able to get results within seconds.
Assuming your data files are not corrupt and have all columns in the right order, with none missing and none added (so you can rely on their proper formatting), just try this code:
from time import perf_counter as T
import glob
import os

sT = T()
filesProcessed = 0
columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

csv_lines = []
csv_line_counter = 0
for filename in file_list:
    filesProcessed += 1
    with open(filename) as datafile:
        csv_line = ""
        for line in datafile.read().splitlines():
            var = line.partition(" ")[-1]
            csv_line += var.strip() + ';'
    csv_lines.append(str(csv_line_counter) + ';' + csv_line[:-1])
    csv_line_counter += 1

with open("train.csv", "w") as csvfile:
    csvfile.write(';' + ';'.join(columns) + '\n')
    csvfile.write('\n'.join(csv_lines))

eT = T()
print(f'> {filesProcessed=}, {(eT-sT)=:8.6f}')
I guess you will get the result at a speed beyond your expectations (seconds, not minutes or hours).
On my computer, extrapolating from the processing time for 100 files, the time required for 50,000 files should be about 3 seconds.
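If some of your data files do turn out to be corrupt (see the caveat above), a cheap per-file guard can be added to the loop; a sketch, assuming a file is valid only if it yields exactly len(columns) fields:
for filename in file_list:
    with open(filename) as datafile:
        fields = [line.partition(" ")[-1].strip()
                  for line in datafile.read().splitlines()]
    if len(fields) != len(columns):
        print(f'skipping {filename}: expected {len(columns)} fields, got {len(fields)}')
        continue
    csv_lines.append(str(csv_line_counter) + ';' + ';'.join(fields))
    csv_line_counter += 1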
I could not replicate this. I took the example data file and created 5000 copies of it. Then I ran your code with tqdm and without. Below is the version without:
import time
import csv
import os
import glob
import pandas as pd

csvout = pd.DataFrame(columns=["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"])
file_list = glob.glob(os.path.join(os.getcwd(), "sample_files/", "*.txt"))

t1 = time.time()
for filename in file_list:
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in datafile:
            # Note: partition results in 3 string parts: "key", " ", "value";
            # the slice [::2] (step 2) keeps only the 1st and 3rd items
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)
# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
t2 = time.time()
print(t2 - t1)
The times I got were:
tqdm: 33 seconds
no tqdm: 34 seconds
Then I ran a version using the csv module:
t1 = time.time()
with open('output.csv', 'a', newline='') as csv_file:
    columns = ["ID", "Delivery_person_ID", "Delivery_person_Age", "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude", "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date", "Time_Orderd", "Time_Order_picked", "Weather conditions", "Road_traffic_density", "Vehicle_condition", "Type_of_order", "Type_of_vehicle", "multiple_deliveries", "Festival", "City", "Time_taken (min)"]
    mydict = {}
    d_Writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=',')
    d_Writer.writeheader()
    for filename in file_list:
        with open(filename) as datafile:
            for line in datafile:
                name, var = line.partition(" ")[::2]
                mydict[name.strip()] = var.strip()
        d_Writer.writerow(mydict)
t2 = time.time()
print(t2 - t1)
The time for this was:
csv 0.32231569290161133 seconds.
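Most of the pandas cost above comes from DataFrame.append, which copies the entire frame on every call (and has since been deprecated). If you do want a DataFrame at the end, a common pattern is to collect the per-file dicts in a list and build the frame once; a sketch along those lines:
rows = []
for filename in file_list:
    mydict = {}
    with open(filename) as datafile:
        for line in datafile:
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    rows.append(mydict)
csvout = pd.DataFrame(rows, columns=columns)
csvout.to_csv("train.csv", sep=";", index=False)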
Try it like this:
import glob

with open('my_file.csv', 'a') as csv_file:
    for path in glob.glob('./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

How to sum specific values from two different txt files in python

I have 2 txt files with names and scores. For example:
File 1             File 2             Desired Output
Name     Score     Name     Score     Name     Score
Michael  20        Michael  30        Michael  50
Adrian   40        Adrian   50        Adrian   90
Jane     60                           Jane     60
I want to sum the scores with the same names and print them. I tried to pair names and scores in two different dictionaries and then merge the dictionaries. However, I can't keep the same names with different scores, so I'm stuck here. I've written something like the following:
d1 = dict()
d2 = dict()

with open('data1.txt', "r") as f:
    test = [i for line in f for i in line.split()]
i = 0
while i < len(test) - 1:
    d1[test[i]] = test[i + 1]
    i += 2
del d1['Name']

with open('data2.txt', "r") as f:
    test = [i for line in f for i in line.split()]
i = 0
while i < len(test) - 1:
    d2[test[i]] = test[i + 1]
    i += 2
del d2['Name']

z = dict(d2.items() | d1.items())
Using a dictionary comprehension should get you what you are after. I have assumed the contents of the files are:
File1.txt:
Name Score
Michael 20
Adrian 40
Jane 60
File2.txt:
Name Score
Michael 30
Adrian 50
Then you can get a total as:
with open("file1.txt", "r") as file_in:
next(file_in) # skip header
file1_data = dict(row.split() for row in file_in if row)
with open("file2.txt", "r") as file_in:
next(file_in) # skip header
file2_data = dict(row.split() for row in file_in if row)
result = {
key: int(file1_data.get(key, 0)) + int(file2_data.get(key, 0))
for key
in set(file1_data).union(file2_data) # could also use file1_data.keys()
}
print(result)
This should give you a result like:
{'Michael': 50, 'Jane': 60, 'Adrian': 90}
Use defaultdict:
from collections import defaultdict

name_scores = defaultdict(int)
files = ('data1.txt', 'data2.txt')
for file in files:
    with open(file, 'r') as f:
        for line in f:
            name, score = line.split()
            name_scores[name] += int(score)
edit: You'll probably have to skip any header line and maybe clean up trailing white spaces, but the gist of it is above.
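Folding the header skip and the blank-line cleanup into the loop might look like this (a sketch; file names as in the question):
from collections import defaultdict

name_scores = defaultdict(int)
for file in ('data1.txt', 'data2.txt'):
    with open(file, 'r') as f:
        next(f)  # skip the "Name Score" header
        for line in f:
            if line.strip():  # ignore blank lines
                name, score = line.split()
                name_scores[name] += int(score)
print(dict(name_scores))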

How to convert DataFrame to single row list

I want to convert a DataFrame in the file "text.txt" that contains 101 rows × 1 column into 1 row × 1 column with a ',' separator.
I tried this code:
tweets_data = []
with open("text.txt", "r", encoding="utf8") as f:
    for tweet in f:
        ayu = tweet.rstrip('\n').split(',')
        print(ayu)
I expected the output [{'text'},{'text'},....,{'text'}]
but the actual output is
[{'text\n'},
{'text\n'},
...
{'text\n'},]
Can anyone help me?
From what I can see, the solution would be something like this:
with open("fileName.txt", "r", encoding="utf-8") as f:
    listOfTweets = []
    for tweet in f:
        listOfTweets.append(tweet.rstrip('\n'))  # strip the trailing newline
print(listOfTweets)
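If the goal is a single comma-separated row, you can strip the newlines while collecting and join at the end; a sketch building on the above:
with open("fileName.txt", "r", encoding="utf-8") as f:
    tweets = [tweet.rstrip('\n') for tweet in f]
print(','.join(tweets))  # one row, ',' separated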

Re-organizing the data in a text file in python3

I have a text file which looks like the small example:
small example:
Name sample1 sample2 sample3
A2M 9805.6 3646.8 1376.48
ACVR1C 20 37.8 20
ADAM12 197.8 120.96 31.28
I am trying to re-organize the data and make a new text file which looks like the expected output:
expected output:
Name Sample
A2M 9805.6
A2M 3646.8
A2M 1376.48
ACVR1C 20
ACVR1C 37.8
ACVR1C 20
ADAM12 197.8
ADAM12 120.96
ADAM12 31.28
In fact, the last 3 columns (of the input data) will be included in the 2nd column of the output data, and every item in the 1st column of the input file will be repeated 3 times (there are 3 samples per Name).
To do so, I wrote the following code in python3:
def convert(input_file, output_file):
    with open(input_file, 'r') as infile:
        res = {}
        line = infile.split()
        res.keys = line[0]
        res.values = line[2:]
    outfile = open(output_file, "w")
    for k, v in res.items():
        outfile.write(str(k) + '\t' + str(v) + '\n')
But it does not return what I want to get. Do you know how to fix it?
You have a few problems in your code.
First, you should also open the outfile within the with statement. Second, a dict's keys and values are not writable attributes. And last, you try to split the whole file object, which is not possible. You want to loop over all the lines, like so:
def convert(input_file, output_file):
    with open(input_file) as infile, open(output_file, "w") as outfile:
        outfile.write("Name\tSample\n")
        for line in infile:
            values = line.split()
            for value in values[1:]:
                outfile.write(values[0] + "\t" + value + "\n")
Although you should consider changing your format to csv and reading it into a dataframe.
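For example, with pandas the whole reshape is a read plus a melt (a sketch; the input and output file names are assumed):
import pandas as pd

df = pd.read_csv("file1.txt", sep=r"\s+")
long_df = df.melt(id_vars="Name", value_name="Sample")[["Name", "Sample"]]
long_df.sort_values("Name", kind="stable").to_csv("out.txt", sep="\t", index=False)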
Try this:
d = {}
with open('file1.txt', 'r') as f:  # your file
    header = next(f)
    for i in f:
        d.setdefault(i.split()[0], []).extend(i.split()[1:])

with open('nflie1.txt', 'w') as f:  # new file
    f.write('Name Sample\n')
    for k, v in d.items():
        for el in v:
            f.write('{} {}\n'.format(k, el))
Output:
Name Sample
A2M 9805.6
A2M 3646.8
A2M 1376.48
ACVR1C 20
ACVR1C 37.8
ACVR1C 20
ADAM12 197.8
ADAM12 120.96
ADAM12 31.28

How to calculate from a dictionary in python

import operator

with open("D://program.txt") as f:
    Results = {}
    for line in f:
        part_one, part_two = line.split()
        Results[part_one] = part_two

c = sum(int(Results[x]) for x in Results)
r = c / 12
d = len(Results)
F = max(Results.items(), key=operator.itemgetter(1))[0]
u = min(Results.items(), key=operator.itemgetter(1))[0]

print("Number of entries are", d)
print("Student with HIGHEST mark is", F)
print("Student with LOWEST mark is", u)
print("Average mark is", r)

Results = [(v, k) for k, v in Results.items()]
Results.sort(reverse=True)
for v, k in Results:
    print(k, v)

import sys
orig_stdout = sys.stdout
f = open('D://programssr.txt', 'w')
sys.stdout = f
print("Number of entries are", d)
print("Student with HIGHEST mark is", F)
print("Student with LOWEST mark is", u)
print("Average mark is", r)
for v, k in Results:
    print(k, v)
sys.stdout = orig_stdout
f.close()
I want to read a txt file, but the problem is that it can't compute the results I want to write to a new file because of the NAMES and MARKS header in the file. If you remove that line, it works fine. I want to do the calculations without removing NAMES and MARKS from the txt file. Help, what am I doing wrong?
NAMES MARKS
Lux 95
Veron 70
Lesley 88
Sticks 80
Tipsey 40
Joe 62
Goms 18
Wesley 35
Villa 11
Dentist 72
Onty 50
Just consume the first line using the next() function before looping over the file:
with open("D://program.txt") as f:
Results = {}
next(f)
for line in f:
part_one,part_two = line.split()
Results[part_one] = part_two
Note that file objects are iterator-like objects (one-shot iterables): when you loop over them you consume the items, and you have no access to them anymore.
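The one-shot behaviour is easy to see in isolation (a quick illustration):
with open("D://program.txt") as f:
    header = next(f)   # consumes the "NAMES MARKS" line
    body = f.read()    # everything after the header
    print(f.read())    # '' - the file iterator is exhausted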
