removing certain characters from entries within a list - python-3.x

I need to remove the [',\n\xa0 characters and the years (1994) from this entry in the list then iterate over the list doing this to each entry.
is there a way I can do this? I'm newish to python and have been trying for hours
The entries are like so :
[['The Shawshank Redemption\n(1994)\n\n\n 9.2\xa0\xa0\n\n'], ['The Godfather\n(1972)\n\n\n 9.2\xa0\xa0\n\n'], ['The Godfather: Part II\n(1974)\n\n\n 9.0\xa0\xa0\n\n'],
edit: sorry for not including the code, iv managed to strip the numbers and the \n newline characters after the year. but still getting newlines character just after the film title. ill paste my code anwyway thanks!:
from bs4 import BeautifulSoup
import requests
import random
names = []
newList = []
url = 'http://m.imdb.com/chart/top'
# get contents from url
content = requests.get(url).content
# get soup
soup = BeautifulSoup(content,'lxml') # choose lxml parser
# find all the references
ref_tags = soup.findAll('span', { 'class' : 'media-body' })
realTags = soup.find_all("h4")
# iterate through the ResultSet
for i,ref_tag in enumerate(ref_tags):
# print text only
names.append('[{0}] {1}'.format(i,ref_tag.text))
pos = 0
for name in names:
newName = names[pos]
newName = newName[9:]
newName = newName[:100]
newName = newName.split("(")
newName = newName[::2]
del newName[2:9:3]
newList.append(newName)
pos = pos + 1
print(newList)
choice = random.choice(newList)
print(choice)
the output is like this:
[['The Shawshank Redemption\n'], ['The Godfather\n'], ['The Godfather: Part II\n'], ['The Dark Knight\n'], ['12 Angry Men\n']

so i got it to output how i would like. thanks any!
heres the code for anyone who may need it in future:
from bs4 import BeautifulSoup
import requests
import random
names = []
newList = []
url = 'http://m.imdb.com/chart/top'
# get contents from url
content = requests.get(url).content
# get soup
soup = BeautifulSoup(content,'lxml') # choose lxml parser
# find all the references
ref_tags = soup.findAll('span', { 'class' : 'media-body' })
realTags = soup.find_all("h4")
# iterate through the ResultSet
for i,ref_tag in enumerate(ref_tags):
# print text only
names.append('[{0}] {1}'.format(i,ref_tag.text))
pos = 0
for name in names:
newName = names[pos]
newName = newName[9:]
newName = newName[:100]
newName = newName.split("(")
newName = newName[::2]
del newName[2:9:3]
newList.append(newName)
pos = pos + 1
wordChoice = random.choice(newList)
str = str(wordChoice)
editWord = str.split("\\n")
print(editWord[1])
and the output is like so:
Shutter Island

Related

I want to find all the head lines containing certain word/words to be scraped and saved to a text file

How can I use a list of words and make the program pull out any new headings containing any one of the words inside the list. It gives out a error if I try to use the list of key words.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
# print(d1.strftime('%Y/%m/%d'))
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
# filename = 'new.csv'
# f = open(filename, 'w', newline = '')
# fx = csv.writer(f)
containers = soup.find_all('div',class_ = 'list-content')
key_words = ['Road', 'crash', 'dead', 'accidents']
key_word = input('Enter the desired word to search the news: ')
for c in containers:
headings = c.h5.a.text
if key_word in headings:
print(headings)
with open('nw.txt', 'w') as f:
f.write(headings)
# fx.writerow(headings)
You had several bugs in your code, that's why it didn't work as expected.
Here's the correct version of what you want to achieve:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/06'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
with open('nw.txt', 'w') as f:
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
containers = soup.find_all('div',class_ = 'list-content')
key_words = ['Road', 'crash', 'dead', 'accidents']
# key_word = input('Enter the desired word to search the news: ')
for c in containers:
headings = c.h5.a.text
if any(key_word in headings for key_word in key_words):
print(headings)
f.write(headings + '\n')
What's happening (changes are at the bottom):
If you wanted to use a list of keywords (which is called key_words), then an option is to use built-in any function and iterate over all of keywords, checking wherther it is in your current headings.
Also you're open-ing file every time you want to write - it destroys last write and creates a new file. Instead you should open file once before loop.
Plus when you were writing headings to file, you didn't add \n which is the newline symbol - it would cause all headings to append as one row.

adding "na" text to an array within a loop

I've gotten all the data I wanted from scraping this metacritc url (see below) however, I can't seem to put a value in for when I don't find the associated value for list (missing values)
I would like to have it so all the lists are even (so I can right to .csv)
Here is the code I have so far:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
#Define year
year_number = 2018
# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
genresNew = []
for element in i:
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
print(url)
year_number -= 1
# not sure about this but it works (I was getting blocked by something and this the way I found around it)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
#this grabs the all the text from the page
html_soup = BeautifulSoup(webpage, 'html5lib')
#this is for selecting all the games in from 1 to 100 (the list of them)
game_names = html_soup.find_all("div", class_="main_stats")
game_metas = html_soup.find_all("a", class_="basic_stat product_score")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
game_releases = html_soup.find_all("ul", class_='more_stats')
game_publishers = html_soup.find_all("li", class_='stat publisher')
game_ratings = html_soup.find_all("li", class_='stat maturity_rating')
game_genres = html_soup.find_all("li", class_='stat genre')
#Extract data from each game
for games in game_names:
name = games.find()
names.append(name.text.strip())
for games2 in game_metas:
metascore = games2.find()
metascores.append(metascore.text.strip())
for games3 in game_releases:
release_date = games3.find()
release_dates.append(release_date.text.strip())
for games4 in game_users:
userscore = games4.find('span', class_="data textscore textscore_favorable") or games4.find('span', class_="data textscore textscore_mixed")
if userscore:
userscores.append(userscore.text)
for games5 in game_publishers:
publisher = games5.find("span", class_ = "data")
if publisher:
publishers.append(publisher.text)
for games6 in game_ratings:
rating = games6.find("span", class_ = "data")
for games7 in game_genres:
genre = games7.find("span", class_ = "data")
if genre:
genres.append(genre.text)
for x in release_dates:
temp = str(x)
temp2 = temp.replace("Release Date:\n ", "")
release_datesNew.append(temp2)
for z in genres:
temp3 = str(z)
temp4 = temp3.strip()
temp5 = temp4.replace(" ", "")
genresNew.append(temp5)
df = pd.DataFrame({'Games:': names})
not sure how I would work that in to this code
From what I understand it's take all the data it can find but if there is a blank it doesn't know about it
can someone adivse the best solution for this situation
any help would be great
Thanks
Just add else's for the existing conditions...
if userscore:
userscores.append(userscore.text)
else:
userscores.append('na')

I'm stumped at looping through a returned list of URLs

My first python project, I'm trying to scrape restaurant inspection. One site has summaries that offer keys to the detailed reports that I want to scrape. I'm stumped at looping through the keyed list of urls to get the details.
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError
try:
insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
usecols=[2,14,18,80,81])
except IOError:
print("The file is not accessible.")
insp.columns = ["CountyName", "InspectDate",
"NumHighVio", "LicenseID", "VisitID"]
# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) &
(alachua['InspectDate'] < today)]
# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
visitID = rows['VisitID']
licID = rows['LicenseID']
urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=
%s &licid= %s" % (visitID, licID)
urls = urls.replace(' ', '')
print(urls)
## here's my problem:
for url in urls:
def get_inspect_detail():
html = urlopen(url)
soup = bs4.BeautifulSoup(html.read(), 'lxml')
details = soup.find_all('font', {'face':'verdana'})[10:]
for detail in details:
siteName = details[0].text
licNum = details[2].text
siteRank = details[4].text
expDate = details[6].text
primeStatus = details[8].text
secStatus = details[10].text
siteAddress = details[12].text
inspectResult = details[20].text
observed1 = details[34].get_text
observed2 = details[36].text
observed3 = details[38].text
observed4 = details[40].text
observed5 = details[42].text
observed6 = details[44].text
observed7 = details[46].text
observed8 = details[48].text
observed9 = details[50].text
observed10 = details[52].text
detailsLib = {
'Restaurant': siteName,
'License': licNum,
'Rank': siteRank,
'Expires': expDate,
'Primary': primeStatus,
'Secondary': secStatus,
'Address': siteAddress,
'Result': inspectResult,
'Observed1': observed1,
'Observed2': observed2,
'Observed3': observed3,
'Observed4': observed4,
'Observed5': observed5,
'Observed6': observed6,
'Observed7': observed7,
'Observed8': observed8,
'Observed9': observed9,
'Observed10': observed10
}
repr(get_inspect_detail())
Probably an obvious mistake or lack of knowledge, but I can get the unscrubbed data for one url, but not for all.
I dont see a reason to define your function inside the loop. You would end up with a lot of redundant definitions this way. Second, you could just define a result list and accumulate the detailsLib objects inside it.
def get_inspect_detail(url):
html = urlopen(url)
soup = bs4.BeautifulSoup(html.read(), 'lxml')
details = soup.find_all('font', {'face': 'verdana'})[10:]
result = []
for detail in details:
siteName = details[0].text
licNum = details[2].text
siteRank = details[4].text
expDate = details[6].text
primeStatus = details[8].text
secStatus = details[10].text
siteAddress = details[12].text
inspectResult = details[20].text
observed1 = details[34].get_text
observed2 = details[36].text
observed3 = details[38].text
observed4 = details[40].text
observed5 = details[42].text
observed6 = details[44].text
observed7 = details[46].text
observed8 = details[48].text
observed9 = details[50].text
observed10 = details[52].text
detailsLib = {
'Restaurant': siteName,
'License': licNum,
'Rank': siteRank,
'Expires': expDate,
'Primary': primeStatus,
'Secondary': secStatus,
'Address': siteAddress,
'Result': inspectResult,
'Observed1': observed1,
'Observed2': observed2,
'Observed3': observed3,
'Observed4': observed4,
'Observed5': observed5,
'Observed6': observed6,
'Observed7': observed7,
'Observed8': observed8,
'Observed9': observed9,
'Observed10': observed10
}
result.append(detailsLib)
return result
for url in urls:
repr(get_inspect_detail(url))

How do i sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
raw_json =
requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0,197):
price = (stuff[i]['Last'])
name1 = (stuff[i]['MarketName'])
name = name1.replace("BTC-", "")
prev = (stuff[i]['PrevDay'])
diff = price - prev
change = round(((price - prev) / price) * 100, 2)
final = ('{0},{1}'.format(name,change))
new.append(final)
butFirst = new[0:]
this1 = ("\n".join(butFirst))
text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
Im having problems sorting this output in second column..
I get base 10 errors.. integer errors etc.. i think the problem
is how the number is stored but i cant figure it out.
output looks like this>
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid changing your data to text earlier than you need to. If you operate with a list of dictionaries it's very easy to sort the list.
import json
import csv
import requests
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0,197):
price = float(stuff[i]['Last'])
prev = float(stuff[i]['PrevDay'])
# Use dictionary to hold the data
d = {
'name' : stuff[i]['MarketName'].replace("BTC-", ""),
'change' : round(((price - prev) / price) * 100, 2)
}
new.append(d)
# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])
# Writing the dictionaries to file
with open("Sort.txt", "w") as text_file:
dict_writer = csv.DictWriter(text_file, sorted_list[0].keys())
# include the line below, if you want headers
# dict_writer.writeheader()
dict_writer.writerows(sorted_list)

Socket Error Exceptions in Python when Scraping

I am trying to learn scraping,
I use exceptions lower down in the code to pass through errors because they dont affect the writing of data to csv
I keep getting a "socket.gaierror" but in the handling of that there is a "urllib.error.URLError" in the handling of that I get "NameError: name 'socket' is not defined" which seems circuitous
I kind of understand that using these exceptions may not be the best way to run the code but I cant seem to get past these errors and I dont know a way around or how to fix the errors.
If you have any suggestions outside of fixing the error exceptions that would be greatly appreciated as well.
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
base_url = 'http://www.fangraphs.com/' # used in line 27 for concatenation
years = ['2017','2016','2015'] # for enough data to run tests
#Getting Links for letters
player_urls = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
player_urls.append(base_url + link['href'])
#Getting Alphabet Links
test_for_playerlinks = 'players.aspx?letter='
player_alpha_links = []
for i in player_urls:
if test_for_playerlinks in i:
player_alpha_links.append(i)
# Getting Player Links
ind_player_urls = []
for l in player_alpha_links:
data = urlopen(l)
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
ind_player_urls.append(link['href'])
#Player Links
jan = 'statss.aspx?playerid'
players = []
for j in ind_player_urls:
if jan in j:
players.append(j)
# Building Pitcher List
pitcher = 'position=P'
pitchers = []
pos_players = []
for i in players:
if pitcher in i:
pitchers.append(i)
else:
pos_players.append(i)
# Individual Links to Different Tables Sorted by Base URL differences
splits = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs = 'http://www.fangraphs.com/statsd.aspx?'
split_pp = []
gamel = []
years = ['2017','2016','2015']
for i in pos_players:
for year in years:
split_pp.append(splits + i[12:]+'&season='+ year)
gamel.append(game_logs+ i[12:] + '&type=&gds=&gde=&season=' + year)
split_pitcher = []
gl_pitcher = []
for i in pitchers:
for year in years:
split_pitcher.append(splits + i[12:]+'&season=' + year)
gl_pitcher.append(game_logs + i[12:] + '&type=&gds=&gde=&season=' + year)
# Splits for Pitcher Data
row_sp = []
rows_sp = []
try:
for i in split_pitcher:
sauce = urlopen(i)
soup = BeautifulSoup(sauce, "html.parser")
table1 = soup.find_all('strong', {"style":"font-size:15pt;"})
row_sp = []
for name in table1:
nam = name.get_text()
row_sp.append(nam)
table = soup.find_all('table', {"class":"rgMasterTable"})
for h in table:
he = h.find_all('tr')
for i in he:
td = i.find_all('td')
for j in td:
row_sp.append(j.get_text())
rows_sp.append(row_sp)
except(RuntimeError, TypeError, NameError, URLError, socket.gaierror):
pass
try:
with open('SplitsPitchingData2.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerows(rows_sp)
except(RuntimeError, TypeError, NameError):
pass
I'm guessing your main problem was that you - without any sleep what so ever - queried the site for a huge amount of invalid urls (you create 3 urls for the years 2015-2017 for 22880 pitchers in total, but most of these do not fall within that scope so you have tens of thousands of queries that return errors).
I'm surprised your IP wasn't banned by site admin. That said: It would be better to do some filtering so you avoid all those error queries...
The filter I applied is not perfect. It checks if the years in the list either appears in the start or end the years given on the site (e.g. '2004 - 2015'). This also creates error links but no way near the amount the original script did.
In code it could look like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
import csv
base_url = 'http://www.fangraphs.com/'
years = ['2017','2016','2015']
# Getting Links for letters
letter_links = []
data = urlopen('http://www.fangraphs.com/players.aspx')
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
try:
link = base_url + link['href']
if 'players.aspx?letter=' in link:
letter_links.append(link)
except:
pass
print("[*] Retrieved {} links. Now fetching content for each...".format(len(letter_links)))
# the data resides in two different base_urls:
splits_url = 'http://www.fangraphs.com/statsplits.aspx?'
game_logs_url = 'http://www.fangraphs.com/statsd.aspx?'
# we need (for some reason) players in two lists - pitchers_split and pitchers_game_log - and the rest of the players in two different, pos_players_split and pis_players_game_log
pos_players_split = []
pos_players_game_log = []
pitchers_split = []
pitchers_game_log = []
# and if we wanted to do something with the data from the letter_queries, lets put that in a list for safe keeping:
ind_player_urls = []
current_letter_count = 0
for link in letter_links:
current_letter_count +=1
data = urlopen(link)
soup = BeautifulSoup(data, "html.parser")
trs = soup.find('div', class_='search').find_all('tr')
for player in trs:
player_data = [tr.text for tr in player.find_all('td')]
# To prevent tons of queries to fangraph with invalid years - check if elements from years list exist with the player stat:
if any(year in player_data[1] for year in years if player_data[1].startswith(year) or player_data[1].endswith(year)):
href = player.a['href']
player_data.append(base_url + href)
# player_data now looks like this:
# ['David Aardsma', '2004 - 2015', 'P', 'http://www.fangraphs.com/statss.aspx?playerid=1902&position=P']
ind_player_urls.append(player_data)
# build the links for game_log and split
for year in years:
split = '{}{}&season={}'.format(splits_url,href[12:],year)
game_log = '{}{}&type=&gds=&gde=&season={}'.format(game_logs_url, href[12:], year)
# checking if the player is pitcher or not. We're append both link and name (player_data[0]), so we don't need to extract name later on
if 'P' in player_data[2]:
pitchers_split.append([player_data[0],split])
pitchers_game_log.append([player_data[0],game_log])
else:
pos_players_split.append([player_data[0],split])
pos_players_game_log.append([player_data[0],game_log])
print("[*] Done extracting data for players for letter {} out of {}".format(current_letter_count, len(letter_links)))
sleep(2)
# CONSIDER INSERTING CSV-PART HERE....
# Extracting and writing pitcher data to file
with open('SplitsPitchingData2.csv', 'a') as fp:
writer = csv.writer(fp)
for i in pitchers_split:
try:
row_sp = []
rows_sp = []
# all elements in the pitchers_split are lists. Player name is i[1]
data = urlopen(i[1])
soup = BeautifulSoup(data, "html.parser")
# append name to row_sp from pitchers_split
row_sp.append(i[0])
# the page has 3 tables with the class rgMasterTable, the first i Standard, the second Advanced, the 3rd Batted Ball
# we're only grabbing standard
table_standard = soup.find_all('table', {"class":"rgMasterTable"})[0]
trs = table_standard.find_all('tr')
for tr in trs:
td = tr.find_all('td')
for content in td:
row_sp.append(content.get_text())
rows_sp.append(row_sp)
writer.writerows(rows_sp)
sleep(2)
except Exception as e:
print(e)
pass
Since I'm not sure precisely how you wanted the data formatted on output you need some work on that.
If you want to avoid waiting for all letter_links to be extracted before you retrieve the actual pitcher stats (and fine tune your output) you can move the csv writer part up, so it runs as a part of the letter loop. If you do this don't forget to empty the pitchers_split list before grabbing another letter_link...

Resources