I want to find all the headlines containing certain words, scrape them and save them to a text file - python-3.x

How can I use a list of words and make the program pull out any news headings containing any one of the words in the list? It gives an error if I try to use the list of keywords.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)

date_list = list()
while d1 <= d2:
    # print(d1.strftime('%Y/%m/%d'))
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)

for d in date_list:
    URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
    result = requests.get(URL)
    src = result.text
    soup = BeautifulSoup(src, 'lxml')
    # filename = 'new.csv'
    # f = open(filename, 'w', newline = '')
    # fx = csv.writer(f)
    containers = soup.find_all('div', class_ = 'list-content')
    key_words = ['Road', 'crash', 'dead', 'accidents']
    key_word = input('Enter the desired word to search the news: ')
    for c in containers:
        headings = c.h5.a.text
        if key_word in headings:
            print(headings)
            with open('nw.txt', 'w') as f:
                f.write(headings)
            # fx.writerow(headings)

You had several bugs in your code; that's why it didn't work as expected.
Here's a corrected version of what you want to achieve:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)

date_list = list()
while d1 <= d2:
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)

with open('nw.txt', 'w') as f:
    for d in date_list:
        URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
        result = requests.get(URL)
        src = result.text
        soup = BeautifulSoup(src, 'lxml')
        containers = soup.find_all('div', class_ = 'list-content')
        key_words = ['Road', 'crash', 'dead', 'accidents']
        # key_word = input('Enter the desired word to search the news: ')
        for c in containers:
            headings = c.h5.a.text
            if any(key_word in headings for key_word in key_words):
                print(headings)
                f.write(headings + '\n')
What's happening (the changes are at the bottom):
If you want to use a list of keywords (here called key_words), one option is to use the built-in any function and iterate over all of the keywords, checking whether each one is in the current heading, as in the sketch below.
Also, you were open-ing the file every time you wanted to write. Mode 'w' truncates the file, so each write destroys the previous one and creates a new file; instead you should open the file once, before the loop.
Finally, when you wrote a heading to the file you didn't add '\n', the newline character, which would cause all headings to run together on one line.
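To see the keyword check in isolation, here is a minimal sketch of the any() pattern; the headline string is invented purely for illustration:
key_words = ['Road', 'crash', 'dead', 'accidents']
heading = '3 dead in road crash on highway'

# any() returns True as soon as one keyword occurs in the heading
if any(key_word in heading for key_word in key_words):
    print('match:', heading)

# Note that `in` is case-sensitive, so 'Road' would not match 'road'.
# A case-insensitive variant lowercases both sides first:
if any(key_word.lower() in heading.lower() for key_word in key_words):
    print('case-insensitive match:', heading)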

Related

Extracting column contents only, so that all columns for each row end up in the same row, using Python's BeautifulSoup

I have the following Python snippet in Jupyter Notebook that works.
The challenge I have is to extract just the rows of columnar data.
Here's the snippet:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
page
soup = bs(page.content)
soup
allrows = soup.find_all("p")
print(allrows)
I'm a little unclear on what you're after, but I think it's each individual row of data from the URL provided.
I couldn't find a way to use Beautiful Soup to parse the data into rows directly, but I did find a way to separate the rows using .split():
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

page = requests.get("http://lib.stat.cmu.edu/datasets/boston")
soup = bs(page.content)
allrows = soup.find_all("p")

text = soup.text                  # turn soup into text
text_split = text.split('\n\n')   # split the page into 3 sections
data = text_split[2]              # rows of data

# create df column titles using the variable names on the page
col_titles = text_split[1].split('\n')
df = pd.DataFrame(columns=range(14))
df.columns = col_titles[1:]

# 'try/except' to catch the end of the index;
# loop through the text data building complete rows
try:
    complete_row = []
    n1 = 0  # used to track index
    n2 = 1
    rows = data.split('\n')
    for el in range(len(rows)):
        full_row = rows[n1] + rows[n2]
        complete_row.append(full_row)
        n1 = n1 + 2
        n2 = n2 + 2
except IndexError:
    print('end of loop')

# loop through rows of data, clean whitespace and append to df
for row in complete_row:
    elem = row.split(' ')
    df.loc[len(df)] = [el for el in elem if el]

# finished dataframe
df
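If the goal is simply to get the Boston data into a DataFrame, a more compact route is to let pandas split on whitespace and then re-join the wrapped halves of each record. This is only a sketch and assumes the file layout hasn't changed (22 header lines, each record wrapped onto two physical lines, 14 values per record):
import numpy as np
import pandas as pd

url = "http://lib.stat.cmu.edu/datasets/boston"
# skip the 22 description lines, split every row on runs of whitespace
raw = pd.read_csv(url, sep=r"\s+", skiprows=22, header=None)
# even rows hold the first 11 values, odd rows the remaining 3
data = np.hstack([raw.values[::2, :], raw.values[1::2, :3]])

# column names taken from the variable list in the file header
cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = pd.DataFrame(data, columns=cols)
print(df.shape)  # expected: (506, 14)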

BeautifulSoup, Requests, Dataframe, extracting from <SPAN> and Saving to Excel

Python novice here again! Two questions:
1) Instead of saving to multiple tabs (currently saving each year to a tab named after the year), how can I save all this data into one Excel sheet called "summary"?
2) ('div', class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split "W, 1-0" into two columns, one containing "W" and the next column containing "1-0"?
Thanks so much.
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
import openpyxl
import csv

year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []

with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        url = req.get(f"{lehigh_url}{year}")
        if url.status_code == 200:
            soup = BeautifulSoup(url.text, 'lxml')
            rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
            sheet = pd.DataFrame()
            for row in rows:
                date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
                name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
                opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
                conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
                try:
                    result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
                except:
                    result = ''
                df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
                sheet = sheet.append(df, sort=True).reset_index(drop=True)
            results.append(sheet)

def save_xls(list_dfs, xls_path):
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer, '%s' % year_id[n], index=False)
        writer.save()

save_xls(results, 'lehigh.xlsx')
Instead of creating a list of dataframes, you can append each sheet into 1 dataframe and write that to file with pandas. Then to split into 2 columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup

year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = pd.DataFrame()

for year in year_id:
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
    print (url)
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh, 'lxml')
    rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    sheet = pd.DataFrame()
    for row in rows:
        date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
        name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
        opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
        conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
        try:
            result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
        except:
            result = ''
        df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
        sheet = sheet.append(df, sort=True).reset_index(drop=True)
    results = results.append(sheet, sort=True).reset_index(drop=True)

results['result'], results['score'] = results['result'].str.split(',', 1).str
results.to_excel('lehigh.xlsx')
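Two small follow-ups, hedged because they depend on the pandas version: on recent releases the tuple-unpacking form of .str.split(...).str has been removed (and DataFrame.append is deprecated), so the split is usually written with expand=True; and to_excel takes a sheet_name argument, which covers the "summary" sheet asked for in the question:
# equivalent split on newer pandas: expand=True returns two columns
results[['result', 'score']] = results['result'].str.split(',', n=1, expand=True)

# write everything to a single sheet named "summary", as requested
results.to_excel('lehigh.xlsx', sheet_name='summary', index=False)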

How can I use Python to scrape a multipage table and export to a CSV file?

I am trying to scrape a table that spans multiple pages and export it to a CSV file. Only one line of data seems to get exported, and it is jumbled up.
I have looked on the web and tried many iterations, and I'm very frustrated now. As you can tell from the code, I am a novice at coding!
import bs4 as bs
import urllib.request
import pandas as pd
import csv

max_page_num = 14
max_page_dig = 1  # number of digits in the page number

with open('result.csv', "w") as f:
    f.write("Name, Gender, State, Position, Grad, Club/HS, Rating, Commitment \n")

for i in range(0, max_page_num):
    page_num = (max_page_dig - len(str(i))) * "0" + str(i)  # gives a string in the format of 1, 01 or 001, 005 etc
    print(page_num)
    source = "https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m&graduationYear=2020&positionId=0&playerRating=&stateId=All&pageNo=" + page_num + "&area=commitments"
    print(source)
    url = urllib.request.urlopen(source).read()
    soup = bs.BeautifulSoup(url, 'lxml')
    table = soup.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        #final = row.strip("\n")
        #final = row.replace("\n","")
        with open('result.csv', 'a') as f:
            f.write(row)
It seems that when I write to the CSV it overwrites the previous ones. It also pastes everything on one line, and the player's name is concatenated with the school name. Thanks for any and all help.
I think you have a problem with your inner for loop. Try re-writing it as
with open('result.csv', 'a') as f:
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        # join the cell texts into one comma-separated line
        f.write(','.join(row) + '\n')
and see if it works.
More generally, this can probably be done more simply by using pandas. Try changing your for loop to:
for i in range(0, max_page_num):
    page_num = ...
    source = ....
    # read_html returns a list of DataFrames; take the first table on the page
    df = pd.read_html(source)[0]
    df.to_csv('results.csv', header=False, index=False, mode='a')  # 'a' appends each table to the csv file, instead of overwriting it
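For reference, here is a fuller sketch of that pandas-only approach, assuming the site serves the table to a plain read_html request and that the first table on each page is the one wanted:
import pandas as pd

max_page_num = 14
base = ("https://www.topdrawersoccer.com/search/?query=&divisionId=&genderId=m"
        "&graduationYear=2020&positionId=0&playerRating=&stateId=All"
        "&pageNo={}&area=commitments")

frames = []
for i in range(max_page_num):
    # read_html returns a list of DataFrames, one per <table> on the page
    tables = pd.read_html(base.format(i))
    frames.append(tables[0])

# stack all pages into one table and write a single CSV with one header row
pd.concat(frames, ignore_index=True).to_csv('result.csv', index=False)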

Import and parse .data file

There is a file I tried to import and save as a pandas DataFrame. At first sight it looks like the columns and rows are already ordered, but in the end I had to do a bunch of stuff to create the DataFrame. Could you please check whether there is a much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd

r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)):  # remove empty strings
    if bool(step_1[n]) == False:
        del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)):  # remove empty strings in each sublist
    step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)
If you replace '\t' with a blank space, you can use read_csv to read it. But you need to wrap your text, because the first parameter of read_csv is filepath_or_buffer, which expects an object with a read() method (such as a file handle or StringIO). Your question then essentially reduces to "read_csv doesn't read the column names correctly on this file":
import requests
import pandas as pd
from io import StringIO

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t", " ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
df = pd.read_csv(StringIO(file), sep="\s+", header=None, names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
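As a side note (a sketch; behaviour may vary between pandas versions): read_csv can also take the URL directly, and the regex separator \s+ matches tabs as well as spaces, so the requests call and the tab replacement can be dropped. The quoted car names should still come through as a single field, as in the answer above:
import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
               'acceleration', 'model year', 'origin', 'car name']

# sep=r"\s+" splits on any run of whitespace (spaces or tabs)
df = pd.read_csv(url, sep=r"\s+", header=None, names=list_labels)
print(df.head())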

How do I sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
import json
import requests

raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []

for i in range(0, 197):
    price = (stuff[i]['Last'])
    name1 = (stuff[i]['MarketName'])
    name = name1.replace("BTC-", "")
    prev = (stuff[i]['PrevDay'])
    diff = price - prev
    change = round(((price - prev) / price) * 100, 2)
    final = ('{0},{1}'.format(name, change))
    new.append(final)

butFirst = new[0:]
this1 = ("\n".join(butFirst))

text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
I'm having problems sorting this output by the second column.
I get base-10 errors, integer errors, etc. I think the problem
is how the number is stored, but I can't figure it out.
The output looks like this:
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid converting your data to text earlier than you need to. If you work with a list of dictionaries, it's very easy to sort the list.
import json
import csv
import requests

raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []

for i in range(0, 197):
    price = float(stuff[i]['Last'])
    prev = float(stuff[i]['PrevDay'])
    # Use a dictionary to hold the data
    d = {
        'name': stuff[i]['MarketName'].replace("BTC-", ""),
        'change': round(((price - prev) / price) * 100, 2)
    }
    new.append(d)

# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])

# Writing the dictionaries to file
with open("Sort.txt", "w") as text_file:
    dict_writer = csv.DictWriter(text_file, sorted_list[0].keys())
    # include the line below, if you want headers
    # dict_writer.writeheader()
    dict_writer.writerows(sorted_list)
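And if the name,change text file from the question already exists and only needs re-ordering, a minimal sketch of sorting it numerically by the second column (reusing the Sort.txt file name from the question) is:
# sort an existing "name,change" text file by its numeric second column
with open("Sort.txt") as f:
    lines = [line.strip() for line in f if line.strip()]

# float() is the key: sorting the raw strings compares them
# character by character, which gives the wrong order for numbers
lines.sort(key=lambda line: float(line.split(',')[1]))

with open("Sort.txt", "w") as f:
    f.write("\n".join(lines))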
