Formatting a Python generated CSV - python-3.x

I'm making a web scraper in python.
I'd like to remove the blank rows from the generated CSV and add a header row reading "Car make", "Car Model", "Price". I would also like to remove the [] from all the names in the generated CSV.
imports go here...
# Download the page and parse its HTML.
source = requests.get(' website link goes here...').text
soup = bs(source, 'html.parser')
# Write the header row.
# NOTE(review): this file name has no '.csv' extension, while the data rows
# below are appended to 'pyScraper_1.3_Export.csv' - the header and the data
# therefore end up in two different files.
csv_file = open('pyScraper_1.3_Export', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['brand_Names', 'Prices'])
csv_file.close()
#gives us the make and model of all cars
Names = []
Prices_Cars = []
for var1 in soup.find_all('h3', class_ = 'brandModelTitle'):
car_Names = var1.text # var1.span.text
test_Split = car_Names.split("\n")
# A slice of a list is itself a list, so every entry appended to Names is a
# list rather than a plain string.
full_Names = test_Split[1:3]
#make = test_Split[1:2]
#model = test_Split[2:3]
Names.append(full_Names)
#prices
for Prices in soup.find_all('span', class_ = 'f20 bold fieldPrice'):
Prices = Prices.span.text
Prices = re.sub("^\s+|\s+$", "",Prices, flags=re.UNICODE) # removing whitespace before the prices
Prices_Cars.append(Prices)
# Append one row per scraped price.
# NOTE(review): the file is opened without newline='', which the csv module
# documentation asks for when a file object is handed to csv.writer.
csv_file = open('pyScraper_1.3_Export.csv', 'a')
csv_writer = csv.writer(csv_file)
i = 0
while i < len(Prices_Cars):
# Names[i] is a list (see above), so it is written as a single cell holding
# the list's text form, brackets included.
csv_writer.writerow([Names[i], Prices_Cars[i]])
i = i + 1
csv_file.close()
here is the screenshot of the generated csv
![][1]
[1]: https://i.stack.imgur.com/m7Xw1.jpg

To remove additional newlines:
csv_file = open('pyScraper_1.3_Export.csv', 'a', newline='')
("If csvfile is a file object, it should be opened with newline=''.", https://docs.python.org/3/library/csv.html#csv.writer)
To add headers:
You are actually adding headers already, but to a file named pyScraper_1.3_Export (note the missing .csv extension), which may be a typo. Just change the code at about line 6 to
# Write the header row to the same .csv file that the data rows are later
# appended to. The `with` block closes the file even if writerow() raises.
# newline='' leaves line endings to the csv module, as its documentation
# requires for files passed to csv.writer.
with open('pyScraper_1.3_Export.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Car make", "Car Model", "Price"])
As for removing nested list, unpack Names[i] with * operator:
csv_writer.writerow([*Names[i], Prices_Cars[i]])

Related

.csv to .arff function on Python

I'm trying to write a conversion function from CSV to ARFF; right now I have this:
def csv2arff(csv_path, arff_path=None):
# Start of a CSV -> ARFF converter: as shown, it writes a relation line
# followed by the CSV's first line, unchanged, to the output file.
# csv_path:  path of the CSV file to read.
# arff_path: path of the file to write; when None it is derived from csv_path.
with open(csv_path, 'r') as fr:
# NOTE(review): `attributes` and `write_sw` are assigned but not used in the
# lines shown.
attributes = []
if arff_path is None:
arff_path = csv_path[:-4] + '_prueba.arff' # drops the last 4 characters (assumed to be ".csv") and appends "_prueba.arff"
write_sw = False
with open(arff_path, 'w') as fw:
fw.write('#relation base_datos_modelo_3_limpia \n')
# readlines() reads the whole file; only its first line is kept, with
# trailing whitespace (including the newline) stripped.
firstline = fr.readlines()[0].rstrip()
fw.write(firstline)
and that gives me:
#relation base_datos_modelo_3_limpia
DVJ_Valgus_KneeMedialDisplacement_D_discr,BMI,AgeGroup,ROM-PADF-KE_D,DVJ_Valgus_FPPA_D_discr,TrainFrequency,DVJ_Valgus_FPPA_ND_discr,Asym_SLCMJLanding-pVGRF(10percent)_discr,Asym-ROM-PHIR(≥8)_discr,Asym_TJ_Valgus_FPPA(10percent)_discr,TJ_Valgus_FPPA_ND_discr,Asym-ROM-PHF-KE(≥8)_discr,TJ_Valgus_FPPA_D_discr,Asym_SLCMJ-Height(10percent)_discr,Asym_YBTpl(10percent)_discr,Position,Asym-ROM-PADF-KE(≥8º)_discr,DVJ_Valgus_KneeMedialDisplacement_ND_discr,DVJ_Valgus_Knee-to-ankle-ratio_discr,Asym-ROM-PKF(≥8)_discr,Asym-ROM-PHABD(≥8)_discr,Asym-ROM-PHF-KF(≥8)_discr,Asym-ROM-PHER(≥8)_discr,AsymYBTanterior10percentdiscr,Asym-ROM-PHABD-HF(≥8)_discr,Asym-ROM-PHE(≥8)_discr,Asym(>4cm)-DVJ_Valgus_Knee;edialDisplacement_discr,Asym_SLCMJTakeOff-pVGRF(10percent)_discr,Asym-ROM-PHADD(≥8)_discr,Asym-YBTcomposite(10percent)_discr,Asym_SingleHop(10percent)_discr,Asym_YBTpm(10percent)_discr,Asym_DVJ_Valgus_FPPA(10percent)_discr,Asym_SLCMJ-pLFT(10percent)_discr,DominantLeg,Asym-ROM-PADF-KF(≥8)_discr,ROM-PHER_ND,CPRDmentalskills,POMStension,STAI-R,ROM-PHER_D,ROM-PHIR_D,ROM-PADF-KF_ND,ROM-PADF-KF_D,Age_at_PHV,ROM-PHIR_ND,CPRDtcohesion,Eperience,ROM-PHABD-HF_D,MaturityOffset,Weight,ROM-PHADD_ND,Height,ROM-PHADD_D,Age,POMSdepressio,ROM-PADF-KE_ND,POMSanger,YBTanterior_Dnorm,YBTanterior_NDnorm,POMSvigour,Soft-Tissue_injury_≥4days
So I want to put "#attribute" before each attribute and change each "," to "\n", but I don't know how to do it. I tried to write a function to replace the "," but it didn't work. Any ideas?
Thank you guys.
Try the liac-arff library.
Here is an example for converting the UCI iris dataset from ARFF to CSV and then back to ARFF:
import csv
import arff
# arff -> csv
# Load inside a `with` block so the ARFF file handle is closed after parsing
# instead of being left open for the rest of the script.
with open('./iris.arff', 'r') as fp:
    content = arff.load(fp)

# newline='' is what the csv module asks for on files given to a writer;
# it prevents extra blank rows on platforms that translate line endings.
with open('./out.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    # Each attribute is a (name, type) pair; only the names form the header.
    header = [n for n, t in content['attributes']]
    writer.writerow(header)
    writer.writerows(content['data'])

# csv -> arff
with open('./out.csv', 'r', newline='') as fp:
    reader = csv.reader(fp)
    header = next(reader)  # first row: the attribute names
    data = list(reader)    # remaining rows: the data records

content = {
    'relation': "from my csv file",
    # The "class" column is nominal, so its allowed values must be listed
    # explicitly; every other column is declared NUMERIC.
    'attributes': [
        (n, ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
        if n == "class"
        else (n, 'NUMERIC')
        for n in header
    ],
    'data': data,
}
with open('./out.arff', 'w') as fp:
    arff.dump(content, fp)
NB: For the last stage, we need to specify the nominal class values, which you could determine by scanning the data.

Printing data from the csv file

I want to ask how I can print the number of dentists for the year 2012 from the CSV file.
import csv
# Open the data file and wrap it in a CSV reader; opt_a() below reads from
# this module-level reader.
csv_file = open('project.csv' , 'r')
data_file = csv.reader(csv_file)
def opt_a():
# Skip the first row (the column titles in the sample data), then print each
# remaining row without its first column (the year in the sample data).
next(data_file)
for row in data_file:
i = row[1:]
print(i)
opt_a()
This is what I have tried, but it just prints out every row.
Sample of data :
year,Private Dental Specialists,Private General Dental Practitioners,Public Dental Specialists,Public General Dental Practitioners
2008,116,864,47,268
2009,180,863,74,246
2010,185,874,87,267
2011,199,961,77,241
2012,203,1012,86,271
2013,219,1192,88,308
2014,216,1219,96,348
2015,215,1326,102,347
2016,219,1425,106,380
2017,232,1516,112,365
2018,189,1579,113,412
2019,237,1644,130,379
In your for loop, simply check whether the first value of the row is "2012":
import csv
def opt_a(year="2012"):
    """Print the dentist counts recorded for one year in project.csv.

    The file is opened (and closed) inside the function, so the function can
    be called more than once; a reader shared at module level would already
    be exhausted on the second call.

    :param year: value to look for in the first column. It is compared as
        text because csv.reader yields every field as a string.
    :return: None
    """
    with open('project.csv', 'r', newline='') as csv_file:
        data_file = csv.reader(csv_file)
        next(data_file, None)  # skip the header row; tolerate an empty file
        for row in data_file:
            # csv.reader yields [] for a blank line, so test `row` before
            # indexing into it.
            if row and row[0] == year:
                print(row[1:])


opt_a()

Error "found unreadable content" when open excel file written by .to_excel()

I'm practicing web scraping, and try to get some info of cars.
I want one of my dataframe columns be a hyperlink to the model image, and I succeeded in creating the formula and saving it to dataframe
For example:
print(suvs['images'][0])
=HYPERLINK("https://www.cstatic-images.com/car-pictures/main/USD00AUS052A021001.png"; "Audi Q3")
print(suvs['images'][3])
=HYPERLINK("https://www.cstatic-images.com/car-pictures/main/USD00AUS011B021001.png"; "Audi Q7")
But when I open the file (I'm using Excel 2010), Excel reports the error "Excel found unreadable content...", and if I answer Yes, it deletes the column contents.
Here is log:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<recoveryLog xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><logFileName>error040320_03.xml</logFileName><summary>Errors were detected in file 'D:\Users\Tamer\PycharmProjects\Udemy\suvs.xlsx'</summary><removedRecords summary="Following is a list of removed records:"><removedRecord>Removed Records: Formula from /xl/worksheets/sheet1.xml part</removedRecord></removedRecords></recoveryLog>
I tried to copy and paste the link generated by the code into the file, and it works very well.
So what is the problem?
Thank you
Here's the code:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape the Audi SUV cards from cars.com, follow each card's details page,
# collect the results in a DataFrame and save it to suvs.xlsx.
try:
HEADERS = {"Accept-Language": "en-US,en;q=0.5"}
URL = 'https://www.cars.com/research/audi/'
# One list per output column; they are filled in step inside the card loop.
images = []
models = []
model_years = []
msrp_ranges = []
consumer_ratings = []
# NOTE(review): combined_mpgs and body_styles are never filled in the lines
# shown; their DataFrame columns are commented out below.
combined_mpgs = []
body_styles = []
page = requests.get(URL, headers=HEADERS)
data = BeautifulSoup(page.text, 'html.parser')
cards = data.find_all('div', class_='card make-model-card suvs')
for card in cards:
model = card.p.text
models.append(model)
image = card.img['data-image-src']
# Creating hyperlinks to save it in suvs.xlsx file
# NOTE(review): the formula separates its arguments with ';'. Formulas stored
# in an .xlsx file are normally expected in US-English syntax, i.e. with ','
# between arguments - this may be what Excel reports as "unreadable content";
# confirm against the Excel writer library's formula documentation.
hyperlink = f'=HYPERLINK("{image}"; "{model}")'
print(hyperlink)
images.append(hyperlink)
# images.append(image)
# Get 'More Details' page URL
details_url = 'https://www.cars.com' + card.div.a.attrs['href']
details_page = requests.get(details_url, headers=HEADERS)
details_data = BeautifulSoup(details_page.text, 'html.parser')
# NOTE(review): `details` is not used again in the lines shown.
details = details_data.find_all('div', class_='expanded-list-contents')
model_year = details_data.h3.text
model_years.append(model_year)
if details_data.find('li', class_='expanded-list-item'):
msrp_range = details_data.find('li', class_='expanded-list-item').text.replace('MSRP Range\n', '').strip()
else:
msrp_range = ''
# NOTE(review): this check looks for a 'td' element but then reads the 'li'
# element again, and both of its branches assign msrp_range. If it sits at the
# same level as the check above (indentation was lost in this listing), it
# overwrites that result - confirm the intent.
if details_data.find('td', class_='expanded-list-item'):
msrp_range = details_data.find('li', class_='expanded-list-item').text.replace('MSRP Range\n', '').strip()
else:
msrp_range = ''
msrp_ranges.append(msrp_range)
if details_data.find('div', class_='star-rating__consumer-review'):
consumer_rating = details_data.find('div', class_='star-rating__consumer-review').text.replace('\n', '').strip()
# Keep the first 5 and the last 15 characters of the rating text.
consumer_rating = consumer_rating[:5] + ' ' + consumer_rating[-15:]
else:
consumer_rating = 'Not rated yet'
consumer_ratings.append(consumer_rating)
# Creating dataframe "suvs"
suvs = pd.DataFrame({
'images': images,
'models': models,
'year': model_years,
'msrp_ranges': msrp_ranges,
'consumer_ratings': consumer_ratings,
# 'combined_mpgs': combined_mpgs,
# 'body_styles': body_styles
})
file_name = 'suvs.xlsx'
suvs.to_excel(file_name, index=False) # index=False to avoid creating index column
# to see where you're missing data and how much data is missing
print(suvs.isnull().sum())
except requests.exceptions.ConnectionError:
print("Couldn't connect to the requested page!\n"
'Please, check your internet connection.')
except PermissionError:
# file_name is assigned just before to_excel() inside the try block above.
print(f'The {file_name} file is opened.\n'
'Please, close it, and run the script again.')

How to add headers when creating csv file in python

I am getting "raw" data from a csv file, and putting only what I need for a new csv file that will be used to auto add users to a different system...
I am unsure how to add the correct headers needed for the file.
I've tried looking at other examples of adding headers but have not figured this out yet...
The headers I need to add are as follows:
"ID Card Number","Name","E-Mail","User Level","Position","Status","Covered Under Insurance","Paid Insurance"
(and in that order)
import csv
def studentscsv():
# Read the raw student export and write a reduced CSV with one row per input
# row: student number, full name, e-mail, grade, position, status, covered, paid.
# NOTE(review): both paths are non-raw strings containing backslashes ('\S',
# '\s'); these are not recognised escape sequences, so the backslash is kept,
# but a raw string (r'...') would make that explicit.
with open('..\StudentEmails_and_StudentNumbers.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
with open('mydirectory\student_users.csv', mode='w', newline='') as output_file:
write = csv.writer(output_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
# No header row is written before the loop; every input row is treated as data.
for row in csv_reader:
# NOTE(review): `a` (column 0) is not used in the lines shown.
a = row[0]
studentnumber = row[1]
firstname = row[2]
lastname = row[3]
grade = row[4]
studentname = firstname + " " + lastname
# NOTE(review): '#' appears where an e-mail address would normally have '@' -
# possibly altered when the code was posted; confirm.
studentemail = firstname + "." + lastname + "#mydomain.org"
status = "Active"
position = "Student"
covered = "Yes"
paid = "Yes"
# Column order matches the headers requested in the question, with `grade`
# in the "User Level" position.
write.writerow([studentnumber, studentname, studentemail, grade, position, status, covered, paid])
def main():
# NOTE(review): the docstring documents an `in_file` parameter, but main()
# takes no parameters, and as shown its body is only the docstring - it does
# not call studentscsv(). Confirm whether a call was lost when the code was posted.
"""
Controls the program execution
:param in_file: the name of the input file.
:return: None
"""
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
The file generates fine with the way the code is written. I am just unsure what I need to change to add the headers.
Using the csv module, as you are, it's pretty straightforward. Define your headers in a list and then create a DictWriter with fieldnames set to that list. See the following code and documentation:
import csv
# newline='' leaves line endings to the csv module; without it an extra '\r'
# is written on platforms that use '\r\n' line endings, which shows up as
# blank rows between records.
with open('names.csv', 'w', newline='') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # writeheader() writes the fieldnames, in order, as the first row.
    writer.writeheader()
    # Each dict maps a field name to that row's value for the column.
    writer.writerows([
        {'first_name': 'Baked', 'last_name': 'Beans'},
        {'first_name': 'Lovely', 'last_name': 'Spam'},
        {'first_name': 'Wonderful', 'last_name': 'Spam'},
    ])
Here's the documentation:
https://docs.python.org/3/library/csv.html#csv.DictWriter

Saving list to a .csv file

I have written code that opens a .csv file and takes user input to filter it into a new list. What I am having trouble with is saving this new list to a .csv file properly.
This is my code:
#author: Joakim
from pprint import pprint
import csv
# Read the unemployment CSV, keep the rows whose first column starts with the
# code typed by the user, show them, and write them to a new CSV file.
with open ('Change_in_Unemployment_2008-2014.csv') as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
next(readCSV) #Removing header
result = []
found = False
user_input = input("Please enter a full/partial NUTS code to filter by: ")
for row in readCSV:
# startswith() matches codes that begin with the input.
if row[0].startswith(user_input):
result.append(row)
found = True
if found == False:
print("There are no registered NUTS codes containing your input.. Please try again")
if found == True:
print("\n Successfully found ", len(result), "entries!""\n")
pprint (result)
#store data in a new csv file
# NOTE(review): non-raw string containing a backslash ('\d' is not a recognised
# escape sequence, so the backslash is kept); r"C:\data_STORED.csv" would be explicit.
Stored_path = "C:\data_STORED.csv"
# NOTE(review): opened without newline='', which the csv module documentation
# asks for when a file object is handed to csv.writer.
file = open(Stored_path, 'w')
writer = csv.writer(file)
writer.writerow(["NUTS CODE", " Municipality", " value"])
for i in range(len(result)):
new_row = result[i]
NUTS_CODE = new_row[0]
Municipality = new_row[1]
Value = new_row[2]
# NOTE(review): Value is assigned above but is not part of the row written
# here, although the header has three columns. Indentation was lost in this
# listing: if this call sits outside the for loop, only the last entry is
# written - confirm.
writer.writerow([NUTS_CODE, Municipality])
# NOTE(review): this closes the input file, which the `with` statement above
# already manages; the output handle `file` is not closed in the lines shown.
csvfile.close()
If one runs my code with an input of : PL, one gets this list:
[['PL11', 'odzkie', '2.2'],
['PL12', 'Mazowieckie', '1.2'],
['PL21', 'Maopolskie', '2.9'],
['PL22', 'Slaskie', '2'],
['PL31', 'Lubelskie', '1.1'],
['PL32', 'Podkarpackie', '5.8'],
['PL33', 'Swietokrzyskie', '2.6'],
['PL34', 'Podlaskie', '2.7'],
['PL41', 'Wielkopolskie', '1.6'],
['PL42', 'Zachodniopomorskie', '-1.1'],
['PL43', 'Lubuskie', '1.8'],
['PL51', 'Dolnoslaskie', '0'],
['PL52', 'Opolskie', '1.3'],
['PL61', 'Kujawsko-Pomorskie', '1.6'],
['PL62', 'Warminsko-Mazurskie', '2.4'],
['PL63', 'Pomorskie', '3.1']]'
Now I would like to store this neatly into a new .csv file, but when I use the code above, I only get a couple of values repeated throughout.
What is my error?

Resources