Python: Writing into file fails without error - getting an empty file - python-3.x

This code seems to be working fine and the console is giving me exactly what I need without any errors. However, when I try to open it up with excel it gives me an empty file.
import csv
import urllib.request
from bs4 import BeautifulSoup

# Scrape the MarketWatch analyst-estimates page for AAPL and dump each
# <tr> of the "snapshot" table as one CSV row.
URL = 'https://www.marketwatch.com/investing/stock/aapl/analystestimates'
soup = BeautifulSoup(urllib.request.urlopen(URL).read(), 'lxml')
rows = soup('table', {'class': 'snapshot'})[0].find_all('tr')

# Use a context manager so the file is flushed and closed even if the
# scrape raises midway; newline='' is required by the csv module.
with open('aapl_analyst_estimates', 'w', newline='') as f:
    writer = csv.writer(f)
    for row in rows:
        # Direct children of the <tr> are the cells; strip whitespace.
        cols = [ele.text.strip() for ele in row.findChildren(recursive=False)]
        writer.writerow(cols)
        print(cols)

It works fine for me. I get the following output:
['Number of Ratings:', '41', 'Current Quarters Estimate:', '4.17']
['FY Report Date:', '9 / 2019', "Current Year's Estimate:", '11.99']
["Last Quarter's Earnings:", '2.91', 'Median PE on CY Estimate:', '12.88']
['Year Ago Earnings:', '11.75', 'Next Fiscal Year Estimate:', '13.34']
['', '', 'Median PE on Next FY Estimate:', '11.37']

Related

For Loop to CSV Leading to Uneven Rows in Python

Still learning Python, so apologies if this is an extremely obvious mistake. I've been trying to figure it out for hours now though and figured I'd see if anyone can help out.
I've scraped a hockey website for their ice skate name and price and have written it to a CSV. The only problem is that when I write it to CSV the rows for the name column (listed as Gear) and the Price column are not aligned. It goes:
Gear Name 1
Row Space
Price
Row Space
Gear Name 2
It would be great to align the gear and price rows next to each other. I've attached a link to a picture of the CSV as well if that helps.
import requests
from bs4 import BeautifulSoup as Soup

# Scrape senior ice-hockey skate names and prices from Pure Hockey and
# write them to gear.csv.
webpage_response = requests.get('https://www.purehockey.com/c/ice-hockey-skates-senior?')
webpage = (webpage_response.content)
parser = Soup(webpage, 'html.parser')
filename = "gear.csv"
# NOTE(review): file is opened without a context manager and never closed,
# so the last buffered writes may be lost; prefer `with open(...) as f:`.
f = open(filename, "w")
headers = "Gear, Price"
# BUG: no '\n' after the header, so the first data row runs into it.
f.write(headers)
for gear in parser.find_all("div", {"class": "details"}):
    gearname = gear.find_all("div", {"class": "name"}, "a")
    gearnametext = gearname[0].text
    gearprice = gear.find_all("div", {"class": "price"}, "a")
    gearpricetext = gearprice[0].text
    print (gearnametext)
    print (gearpricetext)
    # BUG: the scraped .text values still contain embedded '\n' characters
    # and no explicit row terminator is written, which produces the uneven
    # rows shown in the linked screenshot.
    f.write(gearnametext + "," + gearpricetext)
[What the uneven rows look like][1]
[1]: https://i.stack.imgur.com/EG2f2.png
With Python 3 I would recommend using with open(filename, 'w') as f: and calling strip() on your texts before you write() them to your file.
Also note that regardless of whether you open the file in 'a' (append) mode, you have to add a linebreak to each line you are writing.
Example
import requests
from bs4 import BeautifulSoup as Soup

# Download the listing page and parse it once.
response = requests.get('https://www.purehockey.com/c/ice-hockey-skates-senior?')
page = Soup(response.content, 'html.parser')

out_path = "gear1.csv"
header_line = "Gear,Price\n"

# The context manager closes (and flushes) the CSV automatically.
with open(out_path, 'w') as f:
    f.write(header_line)
    # One "details" block per product; pull out its name and price.
    for item in page.find_all("div", {"class": "details"}):
        # strip() removes the embedded newlines/whitespace in the cell text.
        name_text = item.find("div", {"class": "name"}).text.strip()
        price_text = item.find("div", {"class": "price"}).text.strip()
        f.write(name_text + "," + price_text + "\n")
Output
Gear,Price
Bauer Vapor X3.7 Ice Hockey Skates - Senior,$249.99
Bauer X-LP Ice Hockey Skates - Senior,$119.99
Bauer Vapor Hyperlite Ice Hockey Skates - Senior,$999.98 - $1149.98
CCM Jetspeed FT475 Ice Hockey Skates - Senior,$249.99
Bauer X-LP Ice Hockey Skates - Intermediate,$109.99
...
I've noticed that gearnametext contains two \n characters inside the string. You should try the str.replace() method to remove the \n characters, which are causing the jump to the next line. Try with:
import requests
from bs4 import BeautifulSoup as Soup

# Scrape skate names and prices and write one "name,price" line per product.
webpage_response = requests.get('https://www.purehockey.com/c/ice-hockey-skates-senior?')
webpage = (webpage_response.content)
parser = Soup(webpage, 'html.parser')

filename = "gear.csv"
# `with` guarantees the file is flushed and closed even on error
# (a bare open() with no close() can lose the last buffered writes).
with open(filename, "w") as f:
    # The header needs its own line, otherwise the first row runs into it.
    f.write("Gear, Price\n")
    for gear in parser.find_all("div", {"class": "details"}):
        gearname = gear.find_all("div", {"class": "name"}, "a")
        # replace() drops the newlines embedded inside the scraped text.
        gearnametext = gearname[0].text.replace('\n', '')
        gearprice = gear.find_all("div", {"class": "price"}, "a")
        gearpricetext = gearprice[0].text.replace('\n', '')
        print(gearnametext)
        print(gearpricetext)
        # Explicit '\n' so each product lands on its own CSV row.
        f.write(gearnametext + "," + gearpricetext + "\n")
I changed inside the loop the second line for the gear name for: gearnametext = gearname[0].text.replace('\n','').

I want to find all the head lines containing certain word/words to be scraped and saved to a text file

How can I use a list of words and make the program pull out any new headings containing any one of the words inside the list. It gives out a error if I try to use the list of key words.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

# Build the inclusive list of dates between s_date and e_date (YYYY/MM/DD),
# then scrape each day's paper for headlines containing a keyword.
s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
    # print(d1.strftime('%Y/%m/%d'))
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)
for d in date_list:
    URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
    result = requests.get(URL)
    src = result.text
    soup = BeautifulSoup(src, 'lxml')
    # filename = 'new.csv'
    # f = open(filename, 'w', newline = '')
    # fx = csv.writer(f)
    containers = soup.find_all('div',class_ = 'list-content')
    key_words = ['Road', 'crash', 'dead', 'accidents']  # BUG: defined but never used below
    key_word = input('Enter the desired word to search the news: ')
    for c in containers:
        headings = c.h5.a.text
        if key_word in headings:
            print(headings)
            # BUG: opening with mode 'w' inside the loop truncates the file on
            # every match, so only the last heading survives; the file should
            # be opened once, before the loops, and each heading written with
            # a '\n' separator.
            with open('nw.txt', 'w') as f:
                f.write(headings)
            # fx.writerow(headings)
You had several bugs in your code, that's why it didn't work as expected.
Here's the correct version of what you want to achieve:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

# Inclusive date range to scan, formatted the way the site expects.
s_date = '2018/01/01'
e_date = '2018/01/06'
cursor = datetime.strptime(s_date, '%Y/%m/%d')
last = datetime.strptime(e_date, '%Y/%m/%d')
one_day = timedelta(days = 1)
date_list = []
while cursor <= last:
    date_list.append(cursor.strftime('%Y/%m/%d'))
    cursor += one_day
print(date_list)

# Open the output once, before the loops, so matches accumulate.
with open('nw.txt', 'w') as f:
    for d in date_list:
        URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
        result = requests.get(URL)
        soup = BeautifulSoup(result.text, 'lxml')
        containers = soup.find_all('div',class_ = 'list-content')
        key_words = ['Road', 'crash', 'dead', 'accidents']
        # key_word = input('Enter the desired word to search the news: ')
        for c in containers:
            headings = c.h5.a.text
            # Keep the heading if it mentions any of the keywords.
            if any(key_word in headings for key_word in key_words):
                print(headings)
                f.write(headings + '\n')
What's happening (changes are at the bottom):
If you wanted to use a list of keywords (which is called key_words), then one option is to use the built-in any function and iterate over all of the keywords, checking whether each one occurs in your current headings.
Also, you are open-ing the file every time you want to write — mode 'w' truncates the file each time, destroying the previous write and creating a new file. Instead you should open the file once, before the loop.
Plus when you were writing headings to file, you didn't add \n which is the newline symbol - it would cause all headings to append as one row.

CSV file to Excel

I'm trying to scrape a website and get the output in an Excel file. I manage to create the Excel file but the columns are all messed up (please see the pictures).
How should I go about transferring the data correctly from the CSV file to the Excel file?
The code I used:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Pull the seven-day forecast tiles and collect the three text fields
# from each tile in parallel lists.
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=34.05349000000007&lon=-118.24531999999999#.XsTs9RMzZTZ')
soup = BeautifulSoup(page.content, 'html.parser')
forecast = soup.find(id = 'seven-day-forecast-body')
tiles = forecast.find_all(class_='tombstone-container')

periods, descriptions, temps = [], [], []
for tile in tiles:
    periods.append(tile.find(class_='period-name').get_text())
    descriptions.append(tile.find(class_='short-desc').get_text())
    temps.append(tile.find(class_='temp').get_text())

# One column per scraped field, one row per forecast period.
weather_stuff = pd.DataFrame(
    {
        'period' : periods,
        'short_descriptions' : descriptions,
        'temperatures' : temps,
    })
print(weather_stuff)
weather_stuff.to_csv('weather.csv')
A minimalistic working example:
# Two rows, two labelled columns — the smallest DataFrame that shows the idea.
data = [['a', 'b'], ['c', 'd']]
df1 = pd.DataFrame(data,
                   index=['row 1', 'row 2'],
                   columns=['col 1', 'col 2'])
df1.to_excel("output.xlsx")
# To specify the sheet name:
df1.to_excel("output.xlsx", sheet_name='Sheet_name_1')
Source: Documentation

How can I use Python to scrape a multipage table and export to a CSV file?

I am trying to scrape a table that spans multiple pages and export it to a CSV file. Only one line of data seems to get exported, and it is jumbled up.
I have looked on the web and tried many iterations and very frustrated now. As you can tell from code I am a novice at coding!
import bs4 as bs
import urllib.request
import pandas as pd
import csv

# Scrape the paginated commitments table and append every row to result.csv.
max_page_num = 14
max_page_dig = 1 # number of digits in the page number
with open('result.csv',"w") as f:
    f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")
for i in range(0, max_page_num):
    page_num = (max_page_dig - len(str(i))) * "0" +str(i) #gives a string in the format of 1, 01 or 001, 005 etc
    print(page_num)
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    print(source)
    url = urllib.request.urlopen(source).read()
    soup = bs.BeautifulSoup(url,'lxml')
    table = soup.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        #final = row.strip("\n")
        #final = row.replace("\n","")
        # BUG: `row` is a list, so f.write(row) raises TypeError; the cells
        # need to be joined into a string (with a trailing '\n'), or written
        # via csv.writer. Re-opening the file for every row is also wasteful —
        # open it once, before the loops.
        with open('result.csv', 'a') as f:
            f.write(row)
It seems that when I write to the CSV it overwrites the previous rows. It also pastes everything on one line, and the player's name is concatenated with the school name. Thanks for any and all help.
I think you have a problem with your inside for loop. Try re-writing it as
# Open the output once in append mode and write every table row to it.
with open('result.csv', 'a') as f:
    for tr in table_rows:
        td = tr.find_all('td')
        row = [cell.text for cell in td]
        # f.write() needs a string, not a list: join the cells with commas
        # and terminate the row with a newline (csv.writer would also work).
        f.write(",".join(row) + "\n")
and see if it works.
More generally, this can probably be done more simply by using pandas. Try changing your for loop to:
for i in range(0, max_page_num):
    page_num = ...
    source = ...
    # pd.read_html returns a *list* of DataFrames (one per <table> on the
    # page), so take the first one before writing it out.
    df = pd.read_html(source)[0]
    # mode='a' appends each page's table to the csv file instead of
    # overwriting it.
    df.to_csv('results.csv', header=False, index=False, mode='a')

Python 3.6 CSV Date Formatting d/m/yyyy to ddmmyyyy

I have a date column in a CSV file which I am trying to format from dd/mm/yyyy to ddmmyyyy. Some of the days and months are single digit which leave them as dmyyyy. When I run a print statement all of the rows output correctly.
import csv

# Zero-pad d/m/yyyy date parts in column 6 and print each reworked row.
# (Fixed: the path string literals in the open() calls were unbalanced.)
with open(r'input file path', 'r') as csvfile:
    with open(r'outputfilepath', 'w', newline='') as output:
        w = csv.writer(output)
        r = csv.reader(csvfile)
        for row in r:
            # this takes care of incomplete rows at the end
            if len(row[6]) > 1:
                dt = row[6].split("/")
                # Left-pad any single-digit day/month part with '0'.
                for n in range(len(dt)):
                    if len(dt[n]) < 2:
                        dt[n] = '0' + dt[n]
                row[6] = dt[0] + dt[1] + dt[2]
                print(row)
            else:
                break
Print Output:
['a', '', 'Tom', 'Smith', 'J ', '', '12201956']
['b', '', 'Rick ', 'JOHNSON ', ' ', '', '08121922']
['c', '', 'Morty', 'Harvey', ' ', '', '06031940']
When I change the print to write rows:
import csv

# Zero-pad d/m/yyyy date parts in column 6 and write each reworked row out.
# (Fixed: the path string literals in the open() calls were unbalanced.)
with open(r'input file path', 'r') as csvfile:
    with open(r'outputfilepath', 'w', newline='') as output:
        w = csv.writer(output)
        r = csv.reader(csvfile)
        for row in r:
            # this takes care of incomplete rows at the end
            if len(row[6]) > 1:
                dt = row[6].split("/")
                # Left-pad any single-digit day/month part with '0'.
                for n in range(len(dt)):
                    if len(dt[n]) < 2:
                        dt[n] = '0' + dt[n]
                row[6] = dt[0] + dt[1] + dt[2]
                # BUG: writerows() expects a collection of rows; passing one
                # row writes each field as its own row. Use w.writerow(row).
                w.writerows(row)
            else:
                break
I get the output below. I've tried moving the writerows function around with no luck. Looking at the CSV module documentation it should delimit on the commas. I'm relatively new to python.
To fix your problem change w.writerows(row) to w.writerow(row). The difference is between the singular and the plural is that the plural version thinks its getting a collection of rows to write. It treats each item in the row you gave as a single row.
Also newline='' to your open because the csv module interacts poorly with universal newline mode on windows. (It tries to write '\r\n'. Universal newline translates that to '\r\r\n'.)
Finally, use datetime to fix your dates.
import csv
from datetime import datetime

# Re-emit the CSV with column 6 reformatted from m/d/yyyy to mmddyyyy.
# newline='' keeps the csv module from doubling '\r' on Windows.
with open(inpath, 'r', newline='') as fin:
    with open(outpath, 'w', newline='') as fout:
        reader = csv.reader(fin)
        writer = csv.writer(fout)
        for row in reader:
            # strptime validates the date; strftime re-emits it zero-padded.
            # Fixed: the format string contained invisible mojibake
            # characters between %d and %Y ('%m%dā€Œā€‹%Y').
            row[6] = datetime.strptime(row[6], '%m/%d/%Y').strftime('%m%d%Y')
            writer.writerow(row)

Resources