Web Scraping with BeautifulSoup code review - python-3.x

from bs4 import BeautifulSoup
import requests
import pandas as pd
records=[]
keep_looking = True
url = 'https://www.tapology.com/fightcenter'
while keep_looking:
    re = requests.get(url)
    soup = BeautifulSoup(re.text, 'html.parser')
    data = soup.find_all('section', attrs={'class': 'fcListing'})
    for d in data:
        event = d.find('a').text
        date = d.find('span', attrs={'class': 'datetime'}).text[1:-4]
        location = d.find('span', attrs={'class': 'venue-location'}).text
        mainEvent = d.find('span', attrs={'class': 'bout'}).text
    url_tag = soup.find('div', attrs={'class': 'fightcenterEvents'})
    if not url_tag:
        keep_looking = False
    else:
        url = "https://www.tapology.com" + url_tag.find('a')['href']
I am wondering if there are any errors in my code. It runs, but it is taking a very long time to finish and I am afraid it might be stuck in an infinite loop. Any feedback would be helpful. Please do not rewrite the whole thing and post that, as I would like to keep this format; I am learning and want to improve.

Although this is not the right site to ask for a code review, I am offering a solution because, going by your description, your script may indeed be falling into an infinite loop.
Try this to get the information from that site. It keeps following the next-page link and stops automatically as soon as there is no new page left to traverse.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
url = 'https://www.tapology.com/fightcenter'
while True:
    re = requests.get(url)
    soup = BeautifulSoup(re.text, 'html.parser')
    for data in soup.find_all('section', attrs={'class': 'fcListing'}):
        event = data.select_one('.name a').get_text(strip=True)
        date = data.find('span', attrs={'class': 'datetime'}).get_text(strip=True)[:-1]
        location = data.find('span', attrs={'class': 'venue-location'}).get_text(strip=True)
        try:
            mainEvent = data.find('span', attrs={'class': 'bout'}).get_text(strip=True)
        except AttributeError:
            mainEvent = ""
        print(f'{event} {date} {location} {mainEvent}')

    urltag = soup.select_one('.pagination a[rel="next"]')
    if not urltag:
        break  # as soon as there is no next-page link to follow, break out of the loop
    url = urljoin(url, urltag.get("href"))  # urljoin saves you from hardcoding the URL prefix
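If you also want to keep the rows rather than just print them (your original script imports pandas and creates a records list), here is a minimal sketch of the same pagination loop, using the same selectors as above, that collects the rows and builds a DataFrame at the end; the dictionary keys are my own choice, not anything the site dictates:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import pandas as pd

url = 'https://www.tapology.com/fightcenter'
records = []
while True:
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for listing in soup.find_all('section', attrs={'class': 'fcListing'}):
        bout = listing.find('span', attrs={'class': 'bout'})
        records.append({
            'event': listing.select_one('.name a').get_text(strip=True),
            'date': listing.find('span', attrs={'class': 'datetime'}).get_text(strip=True)[:-1],
            'location': listing.find('span', attrs={'class': 'venue-location'}).get_text(strip=True),
            'main_event': bout.get_text(strip=True) if bout else '',
        })
    next_link = soup.select_one('.pagination a[rel="next"]')
    if not next_link:
        break
    url = urljoin(url, next_link.get('href'))

df = pd.DataFrame(records)  # one row per event listing
print(df.head())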
For future reference: feel free to post questions like this on the Code Review site to get your code reviewed.

Related

How Can I Assign A Variable To All Of The Items In A List?

I'm following a guide, and it says to print the first item from an HTML document that contains the dollar sign.
It seems to work correctly, printing a price to the terminal that really is present on the webpage. However, I don't want just that single listing; I want to get all of the listings and print them to the terminal.
I'm almost positive you could do this with a for loop, but I don't know how to set that up correctly. Here's the code I have so far; the comment marks the spot, and the line right after it is the code I'm asking about.
from bs4 import BeautifulSoup
import requests
import os
os.system("clear")
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text="$")
#Print all prices instead of just the specified number?
parent = prices[0].parent
strong = parent.find("strong")
print(strong.string)
You could try the following:
from bs4 import BeautifulSoup
import requests
import os
os.system("clear")
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
prices = doc.find_all(text="$")
for price in prices:
    parent = price.parent
    strong = parent.find("strong")
    print(strong.string)
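Note that if a matched "$" text node has no <strong> element next to it, strong will be None and the print will raise an AttributeError. A slightly more defensive version of the loop (the None check is an addition for safety, not something the page is known to require):
for price in prices:
    strong = price.parent.find("strong")
    if strong:  # skip matches that have no <strong> price element
        print(strong.string)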

Python requests pull not always retrieving data

To practice programming, I am trying to help a friend review a subreddit's data via web scraping with requests and bs4. (I prefer requests for this task since I am moving this script over to my Raspberry Pi and don't think its little heart could even get Chrome installed.)
I am running into an issue where the request only outputs results sometimes, meaning it will pull the name and URL of the posts maybe 1 out of 5 times it is run. When the request returns no data, it doesn't raise an error; the program just stops.
from time import sleep
from bs4 import BeautifulSoup  # missing in the original; needed for BeautifulSoup below
import requests
import os
import re

i = 1
selections = ""

r = requests.get("https://www.reddit.com/r/hardwareswap/search?q=Vive&restrict_sr=1", timeout=None)
soup = BeautifulSoup(r.text, 'html.parser')

results = soup.find_all('a', attrs={'data-click-id': 'body'})
textitems = []
for result in results:
    textitems.append(result.text.strip())
for result in textitems:
    print(result)

links = soup.find_all('a', attrs={'data-click-id': 'body'})
for link in links:
    print(link.attrs['href'])
Any thoughts as to why this happens? My initial thoughts were it was either due to a reddit policy or an invalid URL.
Thanks!
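One possible cause of intermittent empty responses like this is Reddit rate-limiting or serving a reduced page to the default python-requests User-Agent. A minimal retry sketch with a custom User-Agent header (the header string, delay, and retry count are assumptions for illustration, not a confirmed diagnosis):
from time import sleep
from bs4 import BeautifulSoup
import requests

url = "https://www.reddit.com/r/hardwareswap/search?q=Vive&restrict_sr=1"
headers = {"User-Agent": "hardwareswap-practice-script/0.1"}  # hypothetical UA string

results = []
for attempt in range(5):  # retry a few times if the page comes back without listings
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all("a", attrs={"data-click-id": "body"})
    if results:
        break
    sleep(5)  # back off briefly before the next attempt

for link in results:
    print(link.text.strip(), link.attrs.get("href"))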

Using Beautifulsoup to parse a big comment?

I'm using BS4 to parse this webpage: https://www.sports-reference.com/cfb/schools/florida/2018/gamelog/
You'll notice there are two separate tables on the page. Here's the relevant snippet of my code, which is successfully returning the data I want from the first table, but does not find anything from the second table:
# import packages
import urllib3
import certifi
from bs4 import BeautifulSoup
import pandas as pd

# settings
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where())

gamelog_offense = []

# scrape the data and write the .csv files
url = "https://www.sports-reference.com/cfb/schools/florida/2018/gamelog/"
response = http.request('GET', url)
soup = BeautifulSoup(response.data, features="html.parser")
cnt = 0
for row in soup.findAll('tr'):
    try:
        col = row.findAll('td')
        Pass_cmp = col[4].get_text()
        Pass_att = col[5].get_text()
        gamelog_offense.append([Pass_cmp, Pass_att])
        cnt += 1
    except:
        pass
print("Finished writing with " + str(cnt) + " records")
Finished writing with 13 records
I've verified the data from the SECOND table is contained within the soup (I can see it!). After lots of troubleshooting, I've discovered that the entire second table is contained within one big comment (why?). I've managed to extract this comment into a single comment object using the code below, but can't figure out what to do with it after that to extract the data I want. Ideally, I'd like to parse the comment in the same way I'm successfully parsing the first table. I've tried using the ideas from similar Stack Overflow questions (selenium, phantomjs)... no luck.
import bs4

defense = soup.find(id="all_defense")
for item in defense.children:
    if isinstance(item, bs4.element.Comment):
        big_comment = item
print(big_comment)
<div class="table_outer_container">
<div class="overthrow table_container" id="div_defense">
...and so on....
Posting an answer here in case others find it helpful. Many thanks to #TomasCarvalho for directing me to a solution. I was able to pass the big comment as HTML into a second soup instance using the following code, and then just use the original parsing code on the new soup instance. (Note: the try/except is because some of the teams have no gamelog, and you can't call .children on a NoneType.)
try:
    defense = soup.find(id="all_defense")
    for item in defense.children:
        if isinstance(item, bs4.element.Comment):
            html = item
    Dsoup = BeautifulSoup(html, features="html.parser")
except:
    html = ''
    Dsoup = BeautifulSoup(html, features="html.parser")
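Spelled out, reusing the original parsing code on the new soup instance could look like the sketch below, continuing from the Dsoup created above (the column indices 4 and 5 simply mirror the offense loop and are an assumption about the defense table's layout):
gamelog_defense = []
cnt = 0
for row in Dsoup.findAll('tr'):
    try:
        col = row.findAll('td')
        # same positional columns as the offense loop; adjust if the defense table differs
        gamelog_defense.append([col[4].get_text(), col[5].get_text()])
        cnt += 1
    except IndexError:
        pass
print("Finished writing with " + str(cnt) + " records")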

find() in Beautifulsoup returns None

I'm very new to programming in general and I'm trying to write my own little torrent leecher. I'm using Beautifulsoup in order to extract the title and the magnet link of a torrent file. However, find() keeps returning None no matter what I do. The page is correct. I've also tested with find_next_sibling and read all the similar questions, but to no avail. Since there are no errors, I have no idea what my mistake is.
Any help would be much appreciated. Below is my code:
import urllib3
from bs4 import BeautifulSoup
print("Please enter the movie name: \n")
search_string = input("")
search_string.rstrip()
search_string.lstrip()
open_page = ('https://www.yify-torrent.org/search/' + search_string + '/s-1/all/all/') # get link - creates a search string with input value
print(open_page)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
manager = urllib3.PoolManager(10)
page_content = manager.urlopen('GET',open_page)
soup = BeautifulSoup(page_content,'html.parser')
magnet = soup.find('a', attrs={'class': 'movielink'}, href=True)
print(magnet)
Check out the following script, which does exactly what you want to achieve. I used the requests library instead of urllib3. The main mistake you made is that you looked for the magnet link in the wrong place; you need to go one layer deeper to dig out that link. Also try using quote instead of string manipulation to fit your search query into the URL.
Give this a shot:
import requests
from urllib.parse import urljoin
from urllib.parse import quote
from bs4 import BeautifulSoup
keyword = 'The Last Of The Mohicans'
url = 'https://www.yify-torrent.org/search/'
base = f"{url}{quote(keyword)}{'/p-1/all/all/'}"
res = requests.get(base)
soup = BeautifulSoup(res.text,'html.parser')
tlink = urljoin(url,soup.select_one(".img-item .movielink").get("href"))
req = requests.get(tlink)
sauce = BeautifulSoup(req.text,"html.parser")
title = sauce.select_one("h1[itemprop='name']").text
magnet = sauce.select_one("a#dm").get("href")
print(f"{title}\n{magnet}")

Scraping the stackoverflow user data

import requests
from bs4 import BeautifulSoup
import csv

response = requests.get('https://stackoverflow.com/users?page=3&tab=reputation&filter=week').text
soup = BeautifulSoup(response, 'lxml')
for items in soup.select('.user-details'):
    name = items.select("a")[0].text
    location = items.select(".user-location")[0].text
    reputation = items.select(".reputation-score")[0].text
    print(name, location, reputation)
    with open('stackdata.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([name, location, reputation])
When we change the URL in this code, the output remains the same.
I came across a similar problem. The solution that works for me is using selenium. Though I used a headless browser, i.e. phantomjs, I assume it should work for other browsers too.
from selenium import webdriver  # missing in the original; needed for webdriver below

driver = webdriver.PhantomJS('/home/practice/selenium/webdriver/phantomjs/bin/phantomjs')
users = []
page_num = 1
driver.get('https://stackoverflow.com/users?page={page_num}&tab=reputation&filter=week'.format(page_num=page_num))
content = driver.find_element_by_id('content')
for details in content.find_elements_by_class_name('user-details'):
    users.append(details.text)
print(users)
Change the page_num to get the desired result.
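For example, to collect several pages in one run, the same calls can be wrapped in a loop over page_num, continuing from the driver created above (the page range 1 to 5 is just an illustration):
users = []
for page_num in range(1, 6):  # illustrative: pages 1 through 5
    driver.get('https://stackoverflow.com/users?page={page_num}&tab=reputation&filter=week'.format(page_num=page_num))
    content = driver.find_element_by_id('content')
    for details in content.find_elements_by_class_name('user-details'):
        users.append(details.text)
print(users)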
Hope this will help!
