Python 3: file.write adds an extra column to a tab-delimited file

I'm writing string entries to a tab-delimited file in Python 3. The code that I use to save the content is:
savedir = easygui.diropenbox()
savefile = input("Please type the filename (including extension): ")
file = open(os.path.join(savedir, savefile), "w", encoding="utf-8")
file.write("Number of entities not found: " + str(missing_count) + "\n")
sep = "\t"
for entry in entities:
    file.write(entry[0] + "\t")
    for item in entry:
        file.write(sep.join(item[0]))
        file.write("\t")
    file.write("\n")
file.close()
The file saves properly. There are no errors or warnings sent to the terminal. When I open the file, I find an extra column has been saved to the file.
Query | Extra | Name
Abu-Jamal, Mumia | A | Mumia Abu-Jamal
Anderson, Walter | A | Walter Inglis Anderson
Anderson, Walter | A | Walter Inglis Anderson
I've added vertical bars between the tabs for clarity; they don't normally appear there. As well, I have removed a few columns at the end. The column between the vertical bars is not supposed to be there. The document that is saved to file is longer than three lines. On each line, the extra column is the first letter of the Query column. Hence, we have A's in these three examples.
entry[0] corresponds exactly to the value in the Query column.
sep.join(item[0]) corresponds exactly to columns 3+.
Any idea why I would be getting this extra column?
Edit: I'm adding the full code for this short script.
# =============================================================================
# Code to query DBpedia for named entities.
#
# =============================================================================
import requests
import xml.etree.ElementTree as et
import csv
import os
import easygui
import re
# =============================================================================
# Default return type is XML. Others: json.
# Classes are: Resource (general), Place, Person, Work, Species, Organization
# but don't include Resource as one of the queryClass values.
# =============================================================================
def urlBuilder(query, queryClass="unknown", returns=10):
    prefix = 'http://lookup.dbpedia.org/api/search/KeywordSearch?'
    # Selects the appropriate QueryClass for the url
    if queryClass == 'place':
        qClass = 'QueryClass=place'
    elif queryClass == 'person':
        qClass = 'QueryClass=person'
    elif queryClass == 'org':
        qClass = 'QueryClass=organization'
    else:
        qClass = 'QueryClass='
    # Sets the QueryString
    qString = "QueryString=" + str(query)
    # Sets the number of returns
    qHits = "MaxHits=" + str(returns)
    # Full url
    dbpURL = prefix + qClass + "&" + qString + "&" + qHits
    return dbpURL

# Takes an XML doc as STRING and returns an array with the name and the URI
def getdbpRecord(xmlpath):
    root = et.fromstring(xmlpath)
    dbpRecord = []
    for child in root:
        temp = []
        temp.append(child[0].text)
        temp.append(child[1].text)
        if child[2].text is None:
            temp.append("Empty")
        else:
            temp.append(findDates(child[2].text))
        dbpRecord.append(temp)
    return dbpRecord

# Looks for a date with pattern: 1900-01-01 OR 01 January 1900 OR 1 January 1900
def findDates(x):
    pattern = re.compile(r'\d{4}-\d{2}-\d{2}|\d{2}\s\w{3,9}\s\d{4}|\d{1}\s\w{3,9}\s\d{4}')
    returns = pattern.findall(x)
    if len(returns) > 0:
        return ";".join(returns)
    else:
        return "None"

#%%
# =============================================================================
# Build and send get requests
# =============================================================================
print("Please select the CSV file that contains your data.")
csvfilename = easygui.fileopenbox("Please select the CSV file that contains your data.")
lookups = []
name_list = csv.reader(open(csvfilename, newline=''), delimiter=",")
for name in name_list:
    lookups.append(name)

# Request to get the max number of returns from the user.
temp = input("Specify the maximum number of returns desired: ")
if temp.isdigit():
    maxHits = temp
else:
    maxHits = 10

queries = []
print("Building queries. Please wait.")
for search in lookups:
    if len(search) == 2:
        queries.append([search[0], urlBuilder(query=search[0], queryClass=search[1], returns=maxHits)])
    else:
        queries.append([search, urlBuilder(query=search, returns=maxHits)])

responses = []
print("Gathering responses. Please wait.")
for item in queries:
    response = requests.get(item[1])
    data = response.content.decode("utf-8")
    responses.append([item[0], data])

entities = []
missing_count = 0
for item in responses:
    temp = []
    if len(list(et.fromstring(item[1]))) > 0:
        entities.append([item[0], getdbpRecord(item[1])])
    else:
        missing_count += 1

print("There are " + str(missing_count) + " entities that were not found.")
print("Please select the destination folder for the results of the VIAF lookup.")
savedir = easygui.diropenbox("Please select the destination folder for the results of the VIAF lookup.")
savefile = input("Please type the filename (including extension): ")
file = open(os.path.join(savedir, savefile), "w", encoding="utf-8")
file.write("Number of entities not found: " + str(missing_count) + "\n")
sep = "\t"
for entry in entities:
    file.write(entry[0] + "\t")
    for item in entry:
        file.write(sep.join(item[0]))
        file.write("\t")
    file.write("\n")
file.close()
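A likely explanation, judging from how entities is built above: each entry is a pair [query, records], so for item in entry yields the query string itself on the first pass. item[0] is then the first character of the query, and sep.join of a one-character string is just that character, which is exactly what lands in the extra column. A minimal sketch of the write loop with the inner iteration corrected (same variable names as above), iterating over the records list rather than over the [query, records] pair:
for entry in entities:
    file.write(entry[0] + "\t")        # the query string
    for record in entry[1]:           # entry[1] is the list of records
        file.write(sep.join(record))  # record is [name, uri, dates]
        file.write("\t")
    file.write("\n")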

Related

Delete duplicate column headers in CSV in Python

Here is my code
import csv
#import pandas as pd

f = open("stu.csv", "a+", newline="")
Smain = csv.writer(f)
Smain.writerow(["Name", "Father Name", "Class", "Admission Number"])
mainrec = []
increas = 1
class1 = open("class-1.csv", "a+", newline="")
stuclas1 = csv.writer(class1)
stuclas1.writerow(["Roll Number", "Name", "Admission Number"])
while True:
    nam = input("Enter Student Name - ")
    Clas = int(input("Enter Class - "))
    Fname = input("Enter Father Name - ")
    adm = 100 + increas
    lst = [nam, Fname, Clas, adm]
    mainrec.append(lst)
    if Clas == 1:
        stucls1list = []
        a1 = 0
        rollnum_cla1 = 0 + increas + a1
        a1 = 0 + rollnum_cla1
        lst1 = [rollnum_cla1, nam, adm]
        stucls1list.append(lst1)
        for i1 in stucls1list:
            stuclas1.writerow(i1)
    increas += 1
    c = input("Input 'Y' If You Want To Record More, Otherwise Press 'N' - ")
    if c == "N":
        break
for i in mainrec:
    Smain.writerow(i)
# load dataset
#df = pd.read_csv("stu.csv")
# select the rows
# if Name column
# has special characters
#print(df[df.Name.str.contains(r'[Name]')])
# drop the merged selected rows
f.close()
class1.close()
Can anyone tell me how I can delete the duplicate header? When I run this program more than once, it starts duplicating the header, and I don't want the header written again and again. Please tell me how I can get rid of that duplicate header row.
stu.csv after running this program twice:
Name,Father Name,Class,Admission Number
xyz,xyz,1,101
qwe,qwe,1,102
N,N,1,103
Name,Father Name,Class,Admission Number # this is the row I want to delete
test,test,1,101
you,you,1,102
Here's a way to solve it.
f = open("stu.csv", "a+", newline="")
Smain = csv.writer(f)
f.seek(0)                # a+ opens with the pointer at the end; rewind before reading
if len(f.read()) == 0:   # nothing in the file yet, so the header has not been written
    Smain.writerow(["Name", "Father Name", "Class", "Admission Number"])
This will check whether there is content inside the file. If it has content, then you already have the header; if there is no content, then you don't have a header yet. That determines whether you need to write the header to the file.
Here's the sample output when I tried:
First run:
No header #my print statement output
Output file after first run: #the file header written to file
Name,Father Name,Class,Admission Number
Second run:
header found, skipped writing header again #my print statement output
It did not write the header to the file again.
Output file on second run:
Name,Father Name,Class,Admission Number
Note that f.write always appends to the end of the file here, since the file is opened in a+ (append) mode, so the data rows still land after the existing content.
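If you'd rather not read the whole file just to test for emptiness, an equivalent check with os.path.getsize (standard library; same filename as above) is a possible alternative:
import csv
import os

# Decide before opening: write the header only if the file is missing or empty.
write_header = not os.path.exists("stu.csv") or os.path.getsize("stu.csv") == 0
f = open("stu.csv", "a+", newline="")
Smain = csv.writer(f)
if write_header:
    Smain.writerow(["Name", "Father Name", "Class", "Admission Number"])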

pd.rename raises KeyError: 'New_Name'

Edit 12/07/19: The problem was not in fact with the pd.rename function, but with the fact that I did not return the pandas DataFrame from the function, so the column change did not exist when printing. i.e.
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- This was missing
Please see the user comment below and upvote them for finding this error for me.
Alternatively, you can continue reading.
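For reference, a minimal sketch of the two equivalent idioms (column names taken from the question): with inplace=True the caller's DataFrame is modified and rename returns None, while without it rename returns a new DataFrame that must be assigned back at the call site.
# Mutate the caller's DataFrame in place; nothing needs to be returned.
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)

# Or return a renamed copy and rebind at the call site:
# multi_sets = change_column_names(multi_sets, 'Unique Pageviews', 'Page_Views')
def change_column_names(as_pandas, old_name, new_name):
    return as_pandas.rename(columns={old_name: new_name})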
The data can be downloaded from this link, but I have added a sample dataset below. The formatting of the file is not that of a typical CSV file; I believe it may have been an assessment piece related to the Hidden Decision Tree article. I have included the portion of the code that handles the unusual format of the text file, as mentioned above, and that allows the user to rename a column.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seems to work when I hard-code the column names inside the rename function:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the Key Error:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)

def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'},
                     inplace=True)

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element,
# i.e. Title, URL, Date, Page Views,
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # change_column_names(multi_sets)
    change_column_names(multi_set, 'Old_Name', 'New_Name')
    print(multi_sets)

main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math

def set_new_columns(as_pandas):
    titles_list = ['Year > 2014', 'Forum', 'Blog', 'Python', 'R',
                   'Machine_Learning', 'Data_Science', 'Data',
                   'Analytics']
    for number, word in enumerate(titles_list):
        as_pandas.insert(len(as_pandas.columns), titles_list[number], 0)

def title_length(as_pandas):
    # Insert new column header then count the number of letters in 'Title'
    as_pandas.insert(len(as_pandas.columns), 'Title_Length', 0)
    as_pandas['Title_Length'] = as_pandas['Title'].map(str).apply(len)

# Although it is log, percentage of change is an inverse linear comparison of
# logX1 - logX2, therefore you could think of it as the percentage change in
# Page Views. The map function allows the function to be performed on all
# rows in column 'Page_Views'.
def log_page_view(as_pandas):
    # Insert new column header
    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(lambda x: math.log(1 + float(x)))
def change_to_numeric(as_pandas):
    # Replace blank strings with NaN, then convert the column to numeric.
    # Use inplace=True so the caller's DataFrame is modified; re-binding the
    # parameter to replace()'s return value would only change a local copy.
    as_pandas.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element,
# i.e. Title, URL, Date, Page Views,
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    import csv
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    multi_sets = change_column_names(multi_sets)
    change_to_numeric(multi_sets)
    log_page_view(multi_sets)
    title_length(multi_sets)
    set_new_columns(multi_sets)
    print(multi_sets)

main()

How to update the contents of a file that has headers in the first line and the corresponding values in the following lines

I have a file with below contents:
pid int| name varchar(20)| price float
1 |Giga. |10.99
2. |PowerGiga. |29.99
I want to replace Giga with Mega in the file where the column is name,
and replace the price column with 13.99 where it is < 15.
I have just written the contents given by the user input to a file; it's not stored with any mappings. How do I replace the name in the file?
Expected:
pid int| name varchar(20)| price float
1 |Mega. |13.99
2. |PowerGiga. |29.99
I have tried this with Python, as below.
What's happening is that my entire file content is getting erased.
import sys, re
import os

mypath = "/Users/rk/Documents/code/PA1/"
db_used = "CS457_PA2"

def updateTable():
    if os.path.isdir(mypath + db_used):
        filepath = mypath + db_used + "/" + filename + ".txt"
        if not os.path.isfile(filepath):  # check if "file not exists" is true
            print("!Failed to insert into table " + filename + " because it does not exist.")
        else:
            Column1 = List[3]
            Column2 = List[6]
            value1 = List[4]
            value2 = List[7]
            newfile = open(filepath, "r")
            for w in newfile:
                list = w.split('|')
                if value1 in list:
                    print(list)
                    a = 1
                    print("yes")
            newfile = open(filepath, "w")
            for a in list:
                if value1 in list[0:]:
                    newfile.write(a.replace(value1, value2))
                    print("check the file if its updated")
                else:
                    print("nothing")
    else:
        db_used == " "
        print("DB is not selected")

user_says = input().strip()
if "UPDATE" in user_says.upper():
    temp = user_says.strip(";")
    removespecial = re.sub("\W+", " ", temp)  # removes special characters but preserves space
    List = removespecial.split(" ")
    filename = List[1]
    updateTable()
else:
    print("Debug")
I tried the below code and it worked for me.
with open(filepath, "rt") as fin:
    with open(out, "wt") as fout:
        for line in fin:
            fout.write(line.replace(value2, value1))
os.remove(filepath)
os.rename(out, filepath)
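For a column-aware update rather than a blind string replace, here is a rough sketch, assuming the pipe-delimited layout shown above with name and price as the second and third columns; the .tmp path and the update_rows helper are illustrative, not from the original code:
import os

def update_rows(filepath):
    tmp = filepath + ".tmp"
    with open(filepath, "rt") as fin, open(tmp, "wt") as fout:
        fout.write(next(fin))  # copy the header line unchanged
        for line in fin:
            cols = [c.strip() for c in line.split("|")]
            if len(cols) < 3:           # skip blank or malformed lines
                fout.write(line)
                continue
            if cols[1].startswith("Giga"):       # name column
                cols[1] = cols[1].replace("Giga", "Mega")
            if float(cols[2]) < 15:              # price column
                cols[2] = "13.99"
            fout.write(" |".join(cols) + "\n")
    os.remove(filepath)
    os.rename(tmp, filepath)
Writing to a temporary file and renaming it over the original avoids the erased-file problem: opening the same file with "w" truncates it before you have read anything back.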

Never resets list

I am trying to create a calorie counter. The standard input goes like this:
python3 calories.txt < test.txt
Inside the calories file, each food is in the following format: apples 500
The problem I am having is that whenever I calculate the values for a person, the list never seems to return to empty.
import sys

food = {}
eaten = {}
finished = {}
total = 0

# mappings
def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1]
            food[key] = value

def calculate(x):
    a = []
    for keys, values in x.items():
        for c in values:
            try:
                a.append(int(food[c]))
            except:
                a.append(100)
    print("before", a)
    a = []
    total = sum(a)  # Problem here
    print("after", a)
    print(total)

def main():
    calories(sys.argv[1])
    for line in sys.stdin:
        lines = line.strip().split(',')
        for c in lines:
            values = lines[0]
            keys = lines[1:]
        eaten[values] = keys
        calculate(eaten)

if __name__ == '__main__':
    main()
Edit - forgot to include what test.txt would look like:
joe,almonds,almonds,blue cheese,cabbage,mayonnaise,cherry pie,cola
mary,apple pie,avocado,broccoli,butter,danish pastry,lettuce,apple
sandy,zuchini,yogurt,veal,tuna,taco,pumpkin pie,macadamia nuts,brazil nuts
trudy,waffles,waffles,waffles,chicken noodle soup,chocolate chip cookie
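For what it's worth, the immediate problem seems to sit inside calculate: a is re-bound to [] right before the sum, so total is always sum([]) == 0, and total inside the function is a new local that shadows the module-level total. A sketch of calculate with just those lines fixed, everything else as in the question:
def calculate(x):
    a = []
    for keys, values in x.items():
        for c in values:
            try:
                a.append(int(food[c]))
            except KeyError:
                a.append(100)  # unknown food: assume 100 calories
    total = sum(a)  # sum before clearing; a is local, so it is fresh on every call
    print(total)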
How to make it easier on yourself:
When reading the calories data, convert the calories to int() as soon as possible; that way there is no need to do it every time you want to sum something.
Dictionaries have a .get(key, defaultvalue) accessor, so "if food not found, use 100 as default" is a one-liner without try: ... except:.
This works for me, not using sys.stdin but supplying the second file as a file argument as well, instead of piping it into the program using <.
I modified some of the parsing to remove whitespace, and calculate now returns a [(name, cal), ...] list of tuples.
May it help you to fix it to your liking:
import sys

food = {}   # global lookup table, as in the question
eaten = {}

def calories(x):
    with open(x, "r") as file:
        for line in file:
            lines = line.strip().split()
            key = " ".join(lines[0:-1])
            value = lines[-1].strip()  # ensure no whitespace sneaks in
            food[key] = int(value)

def getCal(foodlist, defValueUnknown=100):
    """Get sum / total calories of a list of ingredients; unknown foods cost 100."""
    return sum(food.get(x, defValueUnknown) for x in foodlist)

def calculate(x):
    a = []
    for name, foods in x.items():
        a.append((name, getCal(foods)))  # append as (name, calories) tuple
    return a

def main():
    calories(sys.argv[1])
    with open(sys.argv[2]) as f:  # parse as file, not piped in via sys.stdin
        for line in f:
            lines = line.strip().split(',')
            values = lines[0].strip()
            keys = [x.strip() for x in lines[1:]]  # ensure no whitespace sneaks in
            eaten[values] = keys
    calced = calculate(eaten)  # calculate after all lines are read into the dict
    print(calced)

main()
Output:
[('joe', 1400), ('mary', 1400), ('sandy', 1600), ('trudy', 1000)]
Using sys.stdin and piping just led to my console blinking and waiting for manual input - maybe VS related...
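If you do want to keep the < test.txt piping from the original command, one possible variant (a sketch, untested in your setup) is to fall back to sys.stdin when no second file argument is given:
import sys

def main():
    calories(sys.argv[1])
    # Use the second argument as a file if present, otherwise read piped stdin.
    source = open(sys.argv[2]) if len(sys.argv) > 2 else sys.stdin
    for line in source:
        lines = line.strip().split(',')
        eaten[lines[0].strip()] = [x.strip() for x in lines[1:]]
    print(calculate(eaten))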

How to merge two lists at a delimited token in python3

I am a CS major at the University of Alabama. We have a project in our Python class and I am stuck... probably for some stupid reason, but I can't seem to find the answer.
Here is the link to the project, as it would be a pain to try and explain on here:
http://beastie.cs.ua.edu/cs150/projects/project1.html
Here is my code:
import sys
from scanner import scan

def clInput():
    # Gets command line input (check the count before indexing into sys.argv)
    if len(sys.argv) != 4:
        print('Incorrect number of arguments, should be 3')
        sys.exit(1)
    log1 = sys.argv[1]
    log2 = sys.argv[2]
    name = sys.argv[3]
    return log1, log2, name

def openFiles(log1, log2):
    # Opens sys.argv[1] & [2] for reading
    f1 = open(log1, 'r')
    f2 = open(log2, 'r')
    return f1, f2

def merge(log1, log2):
    # Merges parsed logs into list without '---'
    log1Parse = [[]]
    log2Parse = [[]]
    log1Count = 0
    log2Count = 0
    for i in log1:
        if i != ['---']:
            log1Parse[log1Count].append(i)
        else:
            log1Count += 1
            log1Parse.append([])
    for i in log2:
        if i != ['---']:
            log2Parse[log2Count].append(i)
        else:
            log2Count += 1
            log2Parse.append([])
    return(log1Parse[0] + log2Parse[0] + log1Parse[1] + log2Parse[1])

def searchMerge(name, merged):
    # Searches merged list for sys.argv[3]
    for i in range(len(merged)):
        if (merged[i][1] == name):
            print(merged[i][0], merged[i][1], " ".join(merged[i][2:]))

def main():
    log1, log2, name = clInput()
    f1, f2 = openFiles(log1, log2)
    # Sets the contents of the two scanned files to variables
    tokens1 = scan(f1)
    tokens2 = scan(f2)
    # Call to merge and search
    merged = merge(tokens1, tokens2)
    searchMerge(name, merged)

main()
OK, so here's the problem: we are to merge two lists together into a sorted master list, delimited at the ---'s.
My two log files match the ones posted on the website I linked to above. This code works; however, if there are more than two instances of --- in each list, it will not jump to the next list to get the other tokens, and so forth. I have it working for two with the merge function. At the end of that function I return
return(log1Parse[0] + log2Parse[0] + log1Parse[1] + log2Parse[1])
but this only works for two instances of ---. Is there any way I can change my return to look at all of the indexes instead of having to manually put in [0], [1], [2], etc.? I need it to delimit and merge for an arbitrary amount. Please help!
p.s. disregard the noobness... I'm a novice, we all gotta start somewhere
p.p.s. - the from scanner import scan is a scanner I wrote to take in all of the tokens in a given list
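One way to generalize that return, sketched under the assumption that log1Parse and log2Parse are the section lists built in merge above: interleave the sections with itertools.zip_longest, which handles any number of --- sections and pads the log with fewer sections using empty lists, then flatten.
from itertools import zip_longest

def mergeAll(log1Parse, log2Parse):
    # Pairs section 0 with section 0, section 1 with section 1, and so on;
    # fillvalue=[] pads the log that has fewer '---' sections.
    merged = []
    for sec1, sec2 in zip_longest(log1Parse, log2Parse, fillvalue=[]):
        merged.extend(sec1)
        merged.extend(sec2)
    return merged
For exactly two sections per log this produces the same result as the hard-coded return above.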
so.py:
import sys

def main():
    # check and load command line arguments
    if len(sys.argv) != 4:
        print('Incorrect number of arguments, should be 3')
        sys.exit(1)
    log1, log2, name = sys.argv[1], sys.argv[2], sys.argv[3]  # (filled in from the question's clInput)
    # open files using file io
    f1 = open(log1, 'r')
    f2 = open(log2, 'r')
    # list comprehension to process and filter log files
    l1 = [x.strip().split(" ", 2) for x in f1.readlines() if x.strip() != "---"]
    l2 = [x.strip().split(" ", 2) for x in f2.readlines() if x.strip() != "---"]
    f1.close()
    f2.close()
    sorted_merged_lists = sorted(l1 + l2)
    results = [x for x in sorted_merged_lists if x[1] == name]
    for result in results:
        print(result)

main()
CLI:
$ python so.py log1.txt log2.txt Matt
['12:06:12', 'Matt', 'Logged In']
['13:30:07', 'Matt', 'Opened Terminal']
['15:02:00', 'Matt', 'Opened Evolution']
['15:31:16', 'Matt', 'Logged Out']
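If you prefer the original log-line formatting over printed lists, joining each result is a one-line tweak (not part of the original answer):
for result in results:
    print(" ".join(result))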
docs:
http://docs.python.org/release/3.0.1/tutorial/datastructures.html#list-comprehensions
http://docs.python.org/release/3.0.1/library/stdtypes.html?highlight=strip#str.strip
http://docs.python.org/release/3.0.1/library/stdtypes.html?highlight=split#str.split
http://docs.python.org/release/3.0.1/library/functions.html?highlight=sorted#sorted
