Web scraping using BeautifulSoup (error in for loop) - python-3.x

I am new to Python programming and am trying some web scraping with BeautifulSoup just for learning. I iterate over the results with a for loop, but it seems to run only once: the first iteration prints fine, and the next one raises an error that I have tried a lot to resolve without success.
Below is my Code -
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.packtpub.com/all'
page = urlopen(url)
soup_packtpage = BeautifulSoup(page, 'lxml')
page.close()

all_book = soup_packtpage.find_all("div", class_='book-block-outer')

for book_title in all_book:
    title = book_title.div['data-product-title']
    price = book_title.div['data-product-price']
    category = book_title.div['data-product-category']
    print(title)
    print("Rs:-" + price)
    print(category)
and below is the output -
Learn Algorithms and Data Structures in Java for Day-to-Day Applications [Video]
Rs:-199.44
Application Development
Traceback (most recent call last):
File "/home/bhagwatanimesh/PycharmProjects/packet_pub/packet_pub", line 17, in
title = book_title.div['data-product-title']
File "/home/bhagwatanimesh/.local/lib/python3.5/site-packages/bs4/element.py", line 1011, in getitem
return self.attrs[key]
KeyError: 'data-product-title'

It seems you are trying to access a key that is not present in the tag's attribute dictionary: not every book-block-outer div wraps an inner div carrying those data-product-* attributes.
To skip such blocks you can use the code below.
for book_title in all_book:
    try:
        title = book_title.div['data-product-title']
        price = book_title.div['data-product-price']
        category = book_title.div['data-product-category']
        print(title)
        print("Rs:-" + price)
        print(category)
    except KeyError:
        # this block has no product metadata; move on to the next one
        continue
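Alternatively, instead of catching the exception, you can skip a block up front when its inner div has no metadata. A minimal sketch, assuming the same data-product-* attributes as above:

for book_title in all_book:
    inner = book_title.div
    # skip wrapper divs whose inner div carries no product metadata
    if inner is None or 'data-product-title' not in inner.attrs:
        continue
    print(inner['data-product-title'])
    print("Rs:-" + inner.get('data-product-price', 'N/A'))
    print(inner.get('data-product-category', 'N/A'))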

Related

Word search with BeautifulSoup

I'm trying to scrape the news website "https://inshorts.com/en/read/national" and I'm collecting articles with the fields headline and news. I need only the articles on the pages that contain a specific word (e.g., "health"), and I also want to add a "date" field.
Here's my code:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# code for scraping the first page
d = {'headlines': [], 'news': [], 'date': []}
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script", {"type": "text/javascript"})[2].text
min_news_id = min_news_id[25:35]
soup = soup.findAll("div", {"class": "news-card z-depth-1"})

# to search for a specific word in the content
soup = soup.find_all(text=re.compile("Health"))

for data in soup:
    d['headlines'].append(data.find(itemprop="headline").getText())
    d['news'].append(data.find(itemprop="articleBody").getText())
    d['date'].append(data.find(itemprop="date").getText())

# code for scraping more pages
for i in tqdm(range(10)):
    # The site uses JavaScript to load more data from
    # https://inshorts.com/en/ajax/more_news using POST requests
    # with the parameter 'news_offset', which tells the server which
    # page to send. We can make POST requests with this parameter
    # to get new data in JSON format.
    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news", data=params)
        # the JSON holds the HTML in json_data['html'] and
        # json_data['min_news_id'] for the next page
        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        soup = soup.findAll("div", {"class": "news-card z-depth-1"})
        for data in soup:
            d['headlines'].append(data.find(itemprop="headline").getText())
            d['news'].append(data.find(itemprop="articleBody").getText())
            d['date'].append(data.find(itemprop="date").getText())
    except:
        pass

# storing the data into a .csv file
df = pd.DataFrame(d)
df.to_csv("inshorts_news.csv", index=False)
And here's the error:
AttributeError Traceback (most recent call last)
<ipython-input-2-2d109f9dfc91> in <module>()
12
13 #to search specific word in the content
---> 14 soup = soup.find_all(text=re.compile("Health"))
15
16 for data in soup:
/usr/local/lib/python3.7/dist-packages/bs4/element.py in __getattr__(self, key)
1882 def __getattr__(self, key):
1883 raise AttributeError(
-> 1884 "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
1885 )
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
What happens?
As the error tells you, you are calling find_all() on a ResultSet object; that won't work, because a ResultSet is simply a list of tags.
How to fix?
Iterate over the elements of the set and check each one for your keyword:
for data in soup.select('div.news-card.z-depth-1'):
    if data.find(text=re.compile("farmer")):
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re

# code for scraping the first page
d = []
r = requests.get("https://inshorts.com/en/read/national")
soup = BeautifulSoup(r.content, 'html.parser')
min_news_id = soup.findAll("script", {"type": "text/javascript"})[2].text
min_news_id = min_news_id[25:35]

# code for scraping more pages
for i in tqdm(range(2)):
    try:
        params = {'news_offset': min_news_id}
        req = requests.post("https://inshorts.com/en/ajax/more_news", data=params)
        json_data = req.json()
        min_news_id = json_data['min_news_id']
        soup = BeautifulSoup(json_data['html'], 'html.parser')
        for data in soup.select('div.news-card.z-depth-1'):
            if data.find(text=re.compile("farmer")):
                d.append({
                    'headline': data.find(itemprop="headline").getText(),
                    'article': data.find(itemprop="articleBody").getText()
                })
    except Exception as e:
        print(e)

pd.DataFrame(d)
Output
headline article
0 Heavy traffic seen on DND Flyway at Noida toll... Heavy traffic was witnessed on Delhi Noida Dir...
1 Farmers take out protest march in Haryana over... Farmers have taken out a protest march in Hary...
2 Akhilesh Yadav detained in Lucknow after sit-i... Samajwadi Party President Akhilesh Yadav was d...
3 Priyanka detained on way to UP's Lakhimpur Khe... Congress leader Priyanka Gandhi Vadra was deta...
4 Rakesh Tikait reaches UP's Lakhimpur Kheri aft... BKU leader Rakesh Tikait reached UP's Lakhimpu...
5 Opposition to start with 'Photo Ops' in Lakhim... Uttar Pradesh Cabinet Minister Sidharth Nath S...
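To persist the filtered rows to a CSV file, as the original script did, the collected list of dicts can be written out with pandas:

# write the matching articles to disk, as in the question's script
df = pd.DataFrame(d)
df.to_csv("inshorts_news.csv", index=False)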

BS4: AttributeError: 'NoneType' object has no attribute 'text'

I encounter an issue while trying to scrape a certain job-posting website. My URLs are stored in a CSV file, "urls.csv".
Usually the code runs fine, but from time to time I get this error: "AttributeError: 'NoneType' object has no attribute 'text'", sometimes after 1 iteration, sometimes after 30. And if the failure was at, say, i=230, rerunning the script parses that URL fine and then stops again after some other number of iterations.
Can someone advise, please?
Thank you!
Also, the error occurs on the line textoffer = ......
Edit: Link to the csv: https://github.com/DonCheiron/Scraping-Be.Indeed/blob/master/urls.csv
import bs4 as bs
import urllib.request
import csv

with open('C:/Users/******/Desktop/urls.csv', 'r') as f:
    reader = csv.reader(f)
    pages = list(reader)

for i in range(0, 300):
    page = ''.join(map(str, pages[i]))
    print('Working on ' + str(i) + "...")
    sauce = urllib.request.urlopen(page).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    textoffer = soup.body.div.find('div', class_='jobsearch-JobComponent-description icl-u-xs-mt--md').text
    file = open(str(i) + '.txt', 'w')
    file.write(textoffer)
    file.close()
    print(str(i) + " Done!")
Using a few of the random URLs that you supplied, I tried:
with open('urls.csv', 'r') as f:
    reader = csv.reader(f)
    pages = list(reader)

for counter, url in enumerate(pages):
    print(counter, ''.join(url))
    page_response = requests.get(''.join(url))
    print(page_response)
    soup = BeautifulSoup(page_response.content, 'html.parser')
    print(soup.body.div.find('div', class_='jobsearch-JobComponent-description icl-u-xs-mt--md')).text
output:
0 https://be.indeed.com/rc/clk?jk=39582947a2d91970&fccid=adb55a49f6636f0e&vjs=3
<Response [200]>
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-511-2b829cd9fc45> in <module>()
4 print(page_response)
5 soup = BeautifulSoup(page_response.content, 'html.parser')
----> 6 print(soup.body.div.find('div',class_='jobsearch-JobComponent-description icl-u-xs-mt--md')).text
7
8
AttributeError: 'NoneType' object has no attribute 'text'
The traceback is pretty clear in showing that calling .text when find() found nothing is the problem. As to why the same URL only sometimes lacks this class: it is either not actually the same URL, or a dynamic page that doesn't always contain the same elements.
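A defensive version of the loop body (a sketch, reusing the question's selector) checks the result of find() before touching .text, so a page without the description block is reported instead of crashing the run:

offer_div = soup.body.div.find('div', class_='jobsearch-JobComponent-description icl-u-xs-mt--md')
if offer_div is not None:
    with open(str(i) + '.txt', 'w') as out:
        out.write(offer_div.text)
    print(str(i) + " Done!")
else:
    # the description block was not rendered in this response; skip it
    print(str(i) + " skipped: description div not found")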

Python beautiful soup code worked yesterday but gives an error message today

I made a pretty basic web scraper in Python 3.6 that takes a list of URLs stored in a CSV document and returns information about each page. Yesterday it was working.
Today it is no longer working, even with the previously used CSV of URLs; instead I get an error message.
Here is the code that I am working with:
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import time

dataset = pd.read_csv('read_csv.csv')
dataset = dataset.iloc[:, 0].str.strip('[]')

data = []
for i in dataset:
    page = urlopen(i)
    soup = bs(page, 'html.parser', time.sleep(1))
    title = soup.find(attrs={'class': 'title'})
    title = title.text.strip()
    content = soup.find(attrs={'class': 'articleContent articleTruncate'}, itemprop='text')
    content = content.text.strip()
    date = soup.find(attrs={'class': 'date'})
    date = date.text.strip()
    author = soup.find(attrs={'class': 'authorInfo'})
    author = author.text.strip()
    data.append((title, date, author, content))
Here is the console error message:
Traceback (most recent call last):
File "<ipython-input-26-3a1fc158da11>", line 6, in <module>
title = title.text.strip()
AttributeError: 'NoneType' object has no attribute 'text'
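The traceback shows the same NoneType pattern as the previous question: soup.find() returned None because no element with class 'title' was present in that response. A sketch of a more tolerant loop body, using the question's own selectors (whether the site's markup actually changed is an assumption), would skip such pages instead of raising:

for i in dataset:
    page = urlopen(i)
    soup = bs(page, 'html.parser')
    time.sleep(1)  # pause between requests
    title = soup.find(attrs={'class': 'title'})
    content = soup.find(attrs={'class': 'articleContent articleTruncate'}, itemprop='text')
    date = soup.find(attrs={'class': 'date'})
    author = soup.find(attrs={'class': 'authorInfo'})
    # skip the page if any field is missing rather than calling .text on None
    if None in (title, content, date, author):
        print('Skipping', i, '- expected element not found')
        continue
    data.append((title.text.strip(), date.text.strip(),
                 author.text.strip(), content.text.strip()))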

How to increase the request page time in python 3 while scraping web pages?

I have started scraping reviews from an e-commerce platform, running sentiment analysis on them, and sharing the results on my blog, so that readers can understand everything about a product in a single article.
I am using Python packages such as selenium and bs4. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import requests
import re
from bs4 import BeautifulSoup

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/honor-8-pro-midnight-black-128-gb/product-reviews/itmeymafrghbjcpf?page=1&pid=MOBEWXHMVYBBMZGJ"
    browser.get(site)
    file = open("review.txt", "w")
    for count in range(1, 100):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')
        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if number == count:
                button = btn
                break
        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()
        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")
        for tag in ans:
            title = str(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii', 'ignore')
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii', 'ignore')
            content = content[15:-7]
            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)
            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)
    file.close()
The code works fine on platforms like Amazon, but on Flipkart, after crawling 14 pages I get an error page saying "Someting is Wrong!!!" and the crawling stops.
In command line I get this error:
C:\Users\prate\Desktop\Crawler\Git_Crawler\New>python scrape.py
Traceback (most recent call last):
File "scrape.py", line 37, in
WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
File "C:\Users\prate\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
No message is printed either. I think that if I increase the interval between requests to the platform, it might let me keep crawling.
What should I do?
The error says it all:
C:\Users\prate\Desktop\Crawler\Git_Crawler\New>python scrape.py
Traceback (most recent call last):
File "scrape.py", line 37, in
WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
File "C:\Users\prate\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
If you look at the API docs for the expected_conditions clause presence_of_all_elements_located(locator), it is defined as:
An expectation for checking that there is at least one element present on a web page. locator is used to find the elements; it returns the list of WebElements once they are located.
Now, if you browse to the intended webpage:
https://www.flipkart.com/honor-8-pro-midnight-black-128-gb/product-reviews/itmeymafrghbjcpf?page=1&pid=MOBEWXHMVYBBMZGJ
you will find that the page shows no products or reviews, so the locator strategy you adopted, (By.CLASS_NAME, "_2xg6Ul"), doesn't identify any element on the page.
Hence, even though the synchronization time elapses, no WebElements are added to the list and selenium.common.exceptions.TimeoutException is raised.
As you mentioned that the code works fine on platforms like Amazon, it is worth mentioning that https://www.flipkart.com is a ReactJS-based site whose markup is rendered dynamically, so behaviour may differ from website to website.
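One way to keep the crawl going when a page renders without the expected review elements (a sketch; the class names are those from the question and may have changed since) is to catch the TimeoutException and move on to the next page:

from selenium.common.exceptions import TimeoutException

for count in range(1, 100):
    # ... click the pagination button for `count` as in the original script ...
    try:
        WebDriverWait(browser, timeout=10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
    except TimeoutException:
        # no review titles appeared within 10 seconds: the page is empty or blocked
        print("No reviews found on page", count, "- skipping")
        continue
    # ... parse browser.page_source with BeautifulSoup as before ...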

FileNotFoundError while scraping images

I've written this script to download images from a subreddit.
# A script to download pictures from reddit.com/r/HistoryPorn
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
import os
import sys  # TODO: sys.argv

print('Downloading images...')

# Create a directory for photographs
path_to_hist = '/home/tautvydas/Documents/histphoto'
os.chdir(path_to_hist)
if not os.path.exists('/home/tautvydas/Documents/histphoto'):
    os.mkdir(path_to_hist)

website = 'https://www.reddit.com/r/HistoryPorn'

# Go to the internet and connect to the subreddit, start a loop
for i in range(3):
    subreddit = urlopen(website)
    bs_subreddit = BeautifulSoup(subreddit, 'lxml')
    # Create a regex and find all the titles in the page
    remove_reddit_tag = re.compile('(\s*\(i.redd.it\)(\s*))')
    title_bs_subreddit = bs_subreddit.findAll('p', {'class': 'title'})
    # Get text off the page
    pic_name = []
    for item in title_bs_subreddit[1:]:
        item = item.get_text()
        item = remove_reddit_tag.sub('', item)
        pic_name.append(item)
    # Get picture links
    pic_bs_subreddit = bs_subreddit.findAll('div', {'data-url': re.compile('.*')})
    pic_img = []
    for pic in pic_bs_subreddit[1:]:
        pic_img.append(pic['data-url'])
    # Zip all info into one
    name_link = zip(pic_name, pic_img)
    for i in name_link:
        urlretrieve(i[1], i[0])
    # Click next
    for link in bs_subreddit.find('span', {'class': 'next-button'}).children:
        website = link['href']
However I get this FileNotFoundError.
Downloading images...
Traceback (most recent call last):
File "gethist.py", line 44, in <module>
urlretrieve(i[1],i[0])
File "/home/tautvydas/anaconda3/lib/python3.6/urllib/request.py", line 258, in urlretrieve
tfp = open(filename, 'wb')
FileNotFoundError: [Errno 2] No such file or directory: 'Preparation of rocket carrying test instruments, Kauai. June 29, 1962 [2880x1620] https://www.topic.com/a-crimson-fracture-in-the-sky'
What could be the problem? The link in 'data-url' is retrieved fine and works when clicked. Could the problem be that the name contains a hyperlink, or that the name is too long? Up to that image, all other images are downloaded without any issues.
The issue here is related to the names collected: they contain the source of the picture as a URL string, and urlretrieve misinterprets that string as a folder path.
You need to clean the text to remove awkward special characters and perhaps shorten the names, but I suggest changing the parsing pattern too: to get reliable results, parse only the <a> tag that contains the title, not the whole <p>, which also holds the link.
Also, instead of building a zip from two different loops, you can build one list of the main blocks by searching for the class 'thing' (equivalent to findAll('div', {'data-url': re.compile('.*')})), and then run relative queries on each block to find its title and URL.
[...]
remove_reddit_tag = re.compile('(\s*\(i.redd.it\)(\s*))')
name_link = []
for block in bs_subreddit.findAll('div', {'class': 'thing'})[1:]:
    item = block.find('a', {'class': 'title'}).get_text()
    title = remove_reddit_tag.sub('', item)[:100]
    url = block.get('data-url')
    name_link.append((title, url))
    print(url, title)

for title, url in name_link:
    urlretrieve(url, title)
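If a title can still contain characters that are illegal in file names (the '/' characters in the appended source URL are what triggered the FileNotFoundError), a small sanitising step before urlretrieve is a safer default. This is an extra precaution, not part of the answer above:

import re

def safe_filename(title, max_len=100):
    # replace path separators and other awkward characters, then trim the length
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()[:max_len]

for title, url in name_link:
    urlretrieve(url, safe_filename(title))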
