Problem scraping Wikipedia images via Python - python-3.x

I wrote a Python program that scrapes the first image link for a search query on Wikipedia.
My program requires the libraries below:
requests
bs4
html
re
When I run my code and pass it an argument, it returns the error I defined ('Image-Not-Found'). Please help me solve the problem.
My program's source code:
import argparse
import requests
import bs4
import re
import html

# Create the parser
my_parser = argparse.ArgumentParser(description='Wikipedia Image Grabber')
# Add the arguments
my_parser.add_argument('Phrase',
                       metavar='Phrase',
                       type=str,
                       help='Phrase to Search')
# Execute the parse_args() method
args = my_parser.parse_args()
Phrase = args.Phrase
if '.' in Phrase or '-' in Phrase:
    if '.' in Phrase and '-' in Phrase:
        Phrase = str(Phrase).replace('-', ' ')
    elif '-' in Phrase and not '.' in Phrase:
        Phrase = str(Phrase).replace('-', ' ')
Phrase = html.escape(Phrase)
request = requests.get('https://fa.wikipedia.org/wiki/Special:Search?search=%s&go=Go&ns0=1' % Phrase).text
parser = bs4.BeautifulSoup(request, 'html.parser')
none_search_finder = parser.find_all('p', attrs={'class': 'mw-search-nonefound'})
if len(none_search_finder) == 1:
    print('No-Result')
    exit()
else:
    search_results = parser.find_all('div', attrs={'class': 'mw-search-result-heading'})
    if len(search_results) == 0:
        search_result = parser.find_all('h1', attrs={'id': 'firstHeading'})
        if len(search_result) != 0:
            link = 'https://fa.wikipedia.org/wiki/' + str(Phrase)
        else:
            print('Result-Error')
            exit()
    else:
        selected_result = search_results[0]
        regex_exp = r".*<a href=\"(.*)\" title="
        regex_get_uri = re.findall(regex_exp, str(selected_result))
        regex_result = str(regex_get_uri[0])
        link = 'https://fa.wikipedia.org' + regex_result
#---------------
second_request = requests.get(link)
second_request_source = second_request.text
second_request_parser = bs4.BeautifulSoup(second_request_source, 'html.parser')
image_finder = second_request_parser.find_all('a', attrs={'class': 'image'})
if len(image_finder) == 0:
    print('No-Image')
    exit()
else:
    image_finder_e = image_finder[0]
    second_regex = r".*src=\"(.*)\".*decoding=\"async\""
    regex_finder = re.findall(second_regex, str(image_finder_e))
    if len(regex_finder) != 0:
        regexed_uri = str(regex_finder[0])
        img_link = regexed_uri.replace('//', 'https://')
        print(img_link)
    else:
        print("Image-Not-Found")

You can do this without regex. The reason your code is not working is that the position of decoding="async" in the browser's rendered HTML is not the same as in the raw response.
Here is a solution without regex:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Google'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
imglinks = soup.find_all('a', attrs={'class': 'image'})[0]
for img in imglinks.find_all('img'):
    print(img['src'].replace('//', 'https://'))
Output:
https://upload.wikimedia.org/wikipedia/commons/thumb/2/2f/Google_2015_logo.svg/196px-Google_2015_logo.svg.png
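The same img['src'] lookup drops straight into the original script in place of the second_regex block. A minimal sketch of that final step, reusing the answer's Google article as a stand-in for the link the script resolves:
import requests
import bs4

# 'link' here stands in for the article URL the original script resolves;
# the answer's example article is used so the sketch runs on its own.
link = 'https://en.wikipedia.org/wiki/Google'

soup = bs4.BeautifulSoup(requests.get(link).text, 'html.parser')
image_anchors = soup.find_all('a', attrs={'class': 'image'})
if not image_anchors:
    print('No-Image')
else:
    img = image_anchors[0].find('img')
    if img is not None and img.has_attr('src'):
        # src is protocol-relative ('//upload.wikimedia.org/...'),
        # so prefix the scheme instead of regex-matching the tag string.
        print(img['src'].replace('//', 'https://', 1))
    else:
        print('Image-Not-Found')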

Related

data scraping from multiple pages

The code below works for scraping a single page from OLX; I need help extending it to multiple pages.
import requests
import re

url_base = "https://www.olx.in"
url = url_base + "/hyderabad_g4058526/cars_c5"
info_labels = ("itemDetails", "itemPrice", "itemTitle", "item-location")
info_pattern = r'(?s)data-aut-id="{}"[^>]*>(.*?)<'
link_pattern = r'(?s)data-aut-id="itemBox".*?href="([^"]+?)"'
response = requests.get(url)
cars = list(zip(*(re.findall(info_pattern.format(label), response.text) for label in info_labels),
                (url_base + path for path in re.findall(link_pattern, response.text))))
print(cars)
with open('cars_olx.txt', 'w', encoding='utf-8') as f:
    for item in cars:
        f.write(u"%s\n" % str(item))
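One way to cover several result pages is to loop over a page counter in the query string. A rough sketch, assuming (this is an assumption about OLX, not verified) that the listing accepts a page query parameter, and reusing the item-link pattern above:
import re
import requests

url_base = "https://www.olx.in"
listing_path = "/hyderabad_g4058526/cars_c5"
link_pattern = r'(?s)data-aut-id="itemBox".*?href="([^"]+?)"'

all_links = []
for page in range(1, 4):  # first three pages, as an example
    # Assumption: pagination is exposed as a ?page=N query parameter.
    response = requests.get(url_base + listing_path, params={"page": page})
    if response.status_code != 200:
        break  # stop when a page is missing or the request fails
    links = [url_base + path for path in re.findall(link_pattern, response.text)]
    if not links:
        break  # no more results on this page
    all_links.extend(links)

print(len(all_links), "item links collected")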

Trouble printing assignment

I was given some code and, after working out its indentation problems, it runs without errors; however, I now cannot get it to print the result.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import requests

symbol = 'AAPL'
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + symbol + "&type=&dateb=&owner=exclude&start=0&count=100&output=atom"
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
html = soup(page_html, 'html.parser')
entries = html.findAll("entry")
shouldContinue = True
link = ""
for entry in entries:
    if shouldContinue and (
            entry.find("category")["term"].lower() == "10-k" or entry.find("category")["term"].lower() == "10-q" or
            entry.find("category")["term"].lower() == "20-f"):
        firstUrl = entry.find("link")["href"]
        uClientFirstUrl = uReq(firstUrl)
        page_html_firstUrl = uClientFirstUrl.read()
        uClientFirstUrl.close()
        htmlFirstUrl = soup(page_html_firstUrl, 'html.parser')
        tds = htmlFirstUrl.findAll("table")[1].findAll("td")
        foundtd = False
        for td in tds:
            if foundtd == True:
                link = "https://www.sec.gov" + td.find("a")["href"]
                foundtd = False
            if "xbrl instance" in td.text.lower():
                foundtd = True
        shouldContinue = False

def getCash(url, symbol):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    xml = soup(page_html, 'xml')
    cash = xml.findAll("us-gaap:CashAndCashEquivalentsAtCarryingValue")
    if len(cash) == 0:
        cash = xml.findAll("ifrs-full:Cash")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:Cash")
    return cash

print(getCash)
getCash(url, symbol)
I have tried printing the assignment, as well as calling the method without any success. A sense of direction would be appreciated. Thank you.
As mentioned in my comment above:
What effect do you expect from print(getCash)? If you want it to print the return from the getCash() function, delete it (it's not doing anything), and wrap your getCash(url, symbol) call in a print() function.
Basically, do this:
print(getCash(url, symbol))
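Note that getCash() returns a list of BeautifulSoup tags, so that printout shows the raw XML elements; if the numeric figure itself is wanted, the tag text can be pulled out, for example (a follow-on sketch, not part of the original answer):
cash_tags = getCash(url, symbol)
if cash_tags:
    # each matching tag's text is the reported amount as a string
    print([tag.text for tag in cash_tags])
else:
    print("no matching cash tag found")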

How can I make the code follow another link after the first one?

I am trying to find the link at position 3 on the page (the first name is position 1), follow that link, and repeat this process 4 times. At the end I want to print the final link.
The issue I am running into is getting the process to repeat with the new URL. Please give me some advice on how I can make this code run.
import urllib.request, urllib.parse, urllib.error
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
lst = list()
lst2 = list()
count = 0
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
count = int(input('Enter Count '))
position = int(input('Enter Position ')) -1
while count >= 0:
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    tags = soup('a')
    for tag in tags:
        values = tag.get('href', None)
        values = str(values)
        lst.append(values)
    count = count - 1
    lst2.append(lst[position:position+1])
    url = lst2[0]
    url = str(url)
    print(re.findall('http.+html', url))
    lst.clear()
    lst2.clear()
    return url
If I am parsing your question correctly, one way to do this (I'll leave error checking as an exercise for you; also code has not been run) is like this:
# Loop count times; "_" effectively means ignore the counter
for _ in range(count):
    # Get an array of <a> elements,
    # then get the (position-1)th,
    # then get the value of its 'href' attribute
    next_url = soup.find_all('a')[position-1]['href']
    # And repeat for the URL found there
    html = urlopen(next_url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
# Finally, print out the (position-1)th URL on the last page
print(soup.find_all('a')[position-1]['href'])
Of course, if there are not enough links on the page, or there are <a> tags without an href, or the href URL is malformed, the program will crash.
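A slightly more defensive variant of the same loop, with those crash cases handled (a sketch; the checks and messages are illustrative additions, reusing soup, count, position, and ctx from the code above):
for _ in range(count):
    anchors = soup.find_all('a')
    # Bail out cleanly instead of crashing on short pages or missing hrefs.
    if len(anchors) < position or not anchors[position - 1].get('href'):
        print("No usable link at that position on this page")
        break
    next_url = anchors[position - 1]['href']
    print("Following:", next_url)
    html = urlopen(next_url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")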
I was able to answer my own question after playing with the code for a little while longer. I am sure there is a much more elegant solution, but I am really happy I finally got it to run correctly.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
lst = list()
count = 0
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
count = int(input('Enter Count '))
position = int(input('Enter Position ')) -1
while True:
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    tags = soup('a')
    for tag in tags:
        values = tag.get('href', None)
        values = str(values)
        lst.append(values)
    url = str(lst[position:position+1])
    url = url[2:-2]
    print(url)
    lst.clear()
    count = count - 1
    if count == 0:
        break

Web Scraping - iterating over

I'm looking to scrape a hotel platform website for reviews.
I cannot figure out two things:
1 - Why can I not extract all reviews in one go? Say there are 14 reviews; I retrieve only 7 or so of them. I assume there is a restriction imposed by the server hosting the website?
2 - When I iterate over the review list, the child objects that are retrieved are the same each time, i.e. I retrieve the same review_item, instead of iterating through the various objects with tag li and class review_item (see the second code snippet).
I'm running Python 3.7 and an example url is:
url example
Hope you can shed some light here.
Thanks!
Code Snippet 1:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input("Enter Url - ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
html = soup.prettify("utf-8")
hotel_json_details = {}
hotel_json = {}
for line in soup.find_all('script', attrs={"type": "application/ld+json"}):
    details = line.text.strip()
    details = json.loads(details)
    hotel_json_details["name"] = details["name"]
    hotel_json_details["aggregateRating"] = {}
    hotel_json_details["aggregateRating"]["ratingValue"] = details["aggregateRating"]["ratingValue"]
    hotel_json_details["aggregateRating"]["reviewCount"] = details["aggregateRating"]["reviewCount"]
    hotel_json_details["address"] = {}
    hotel_json_details["address"]["Street"] = details["address"]["streetAddress"]
    hotel_json_details["address"]["Locality"] = details["address"]["addressLocality"]
    hotel_json_details["address"]["Region"] = details["address"]["addressRegion"]
    hotel_json_details["address"]["Zip"] = details["address"]["postalCode"]
    hotel_json_details["address"]["Country"] = details["address"]["addressCountry"]
print(hotel_json_details)
div = soup.find_all(['li'], attrs={"class": "review_item"})
print(div)
Code Snippet 2:
hotel_reviews = []
for line in soup.find_all('li', class_='review_item'):
    review = {}
    review["review_metadata"] = {}
    review["review"] = {}
    review["review_metadata"]["review_date"] = soup.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = soup.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = soup.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = soup.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = soup.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = soup.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = soup.find('p', class_='review_neg').text.strip()
    scoreword = soup.find('span', class_='review_item_header_scoreword')
    if scoreword != None:
        review["review_metadata"]["review_header"] = scoreword.text.strip()
    else:
        review["review_metadata"]["review_header"] = ""
    hotel_reviews.append(x)
print(hotel_reviews)
When you are iterating over the review items, you need to use line.find() instead of soup.find(). This way, you'll be looking for review fields inside every review container as opposed to searching the whole HTML tree:
for line in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
    #                                          ^ HERE
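Applied to the question's second snippet, the corrected loop would look roughly like this (a sketch using only the classes from the question; like the original, it assumes every field is present in every review):
hotel_reviews = []
for item in soup.find_all('li', class_='review_item'):
    review = {"review_metadata": {}, "review": {}}
    # Every lookup starts from the individual review container, not the whole page.
    review["review_metadata"]["review_date"] = item.find('p', class_='review_item_date').text.strip()
    review["review_metadata"]["review_staydate"] = item.find('p', class_='review_staydate').text.strip()
    review["review_metadata"]["reviewer_name"] = item.find('p', class_='reviewer_name').text.strip()
    review["review_metadata"]["reviewer_country"] = item.find('span', class_='reviewer_country').text.strip()
    review["review_metadata"]["reviewer_score"] = item.find('span', class_='review-score-badge').text.strip()
    review["review"]["review_pos"] = item.find('p', class_='review_pos').text.strip()
    review["review"]["review_neg"] = item.find('p', class_='review_neg').text.strip()
    scoreword = item.find('span', class_='review_item_header_scoreword')
    review["review_metadata"]["review_header"] = scoreword.text.strip() if scoreword else ""
    hotel_reviews.append(review)  # append the dict built for this item, not an undefined name
print(hotel_reviews)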

Can't print tag 'content' anymore

I had a perfectly good working scraper for TripAdvisor that met all my needs. Then I tried to use it after a four-day break and something went wrong. I quickly realized that TA had changed some of the tags, so I made the appropriate changes, but I still couldn't get it working as before. I want to grab the value of the 'content' attribute within an element.
This is the element:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
and here is the code:
for bubs in data.findAll('div', {'class': "prw_rup prw_common_bubble_rating bubble_rating"}):
    print([img["content"] for img in bubs.select("img[content]")])
but now it only gives me an empty '[]' instead of the content which is '5'.
Anybody know what may have changed?
Here is the rest of my code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
thepage = urllib
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')

for link in soup.findAll('a', href=re.compile('/Hotel_Review')):
    urls.append(base_url + (link.get('href')).strip("#REVIEWS"))

def remove_duplicates(urls):
    output = []
    seen = set()
    for line in urls:
        if line not in seen:
            output.append(line)
            seen.add(line)
    return output

urls2 = remove_duplicates(urls)
for url in urls2:
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        element = driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
        print("succesfull")
        moreinfo = driver.page_source
        moresoup = BeautifulSoup(moreinfo, "html.parser")
        driver.close()
        #moreinfo = urllib
        #moreinfo = urllib.request.urlopen(url)
        #moresoup = BeautifulSoup(moreinfo,"html.parser")
    except:
        print("none")
    for data in moresoup.findAll('div', {"class": "heading_2014 hr_heading"}):
        try:
            for title in data.findAll('h1', {'id': "HEADING"}):
                init_info.append(title.text.strip("\n") + ",\t")
            for add_data in data.findAll('span', {'class': 'format_address'}):
                print((add_data.find('span', {'class': 'street-address'}).text + ",\t"))
                init_info.append(add_data.find('span', {'class': 'street-address'}).text + ",\t")
                init_info.append(add_data.find('span', {'class': 'locality'}).text + ",\t")
                init_info.append(add_data.find('span', {'class': 'country-name'}).text + ",\t")
            for reviews in data.findAll('a', {'class': 'more taLnk'}):
                init_info.append(reviews.text).strip("\n")
                init_info.append(", \t")
                #init_info.append([img["alt"] for img in stars.select("img[alt]")])
                #init_info.append([img["content"] for img in stars.select("img[content]")])
        except:
            init_info.append("N/A" + ", /t")
The element with the content="5" attribute is a span, not an img.
Does this get what you want?
for bubs in data.findAll('div', {'class': "prw_rup prw_common_bubble_rating bubble_rating"}):
    print([elem["content"] for elem in bubs.select("span[content]")])
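The same values can also be pulled with one CSS selector over the whole parsed page, skipping the outer findAll (a sketch reusing moresoup and the classes shown in the question's HTML):
for span in moresoup.select('div.prw_common_bubble_rating span[content]'):
    print(span['content'])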
