Why does wrong indentation make my function misbehave? - python-3.x

I cannot understand why this error happens.
First, I wrote:
import urllib.request
from bs4 import BeautifulSoup
import time
import os


def download_image(url, name):
    """Save `url` as ./scrape_image/<name>.jpg (buggy version from the question)."""
    path = "./scrape_image/"
    imagename = str(name) + ".jpg"
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        # BUG (the point of the question): the download sits inside the
        # `if` block, so it only runs when the directory did not exist yet,
        # i.e. on the very first call only.
        urllib.request.urlretrieve(url, path + imagename)


url = "https://api.XXXkeyword=YYY&limit=1000"
response = urllib.request.urlopen(url)
rss = response.read().decode("utf-8")
soup = BeautifulSoup(rss, "xml")
name = 0
for s in soup.find_all("photo"):
    url = s.find_all("image_url")[0].string
    name += 1
    download_image(url, name)
By running this code, I get only 1 image from the API, even though the correct code should get 1000 images. I fixed the indentation in the first code, so now my code looks like:
import urllib.request
from bs4 import BeautifulSoup
import time
import os


def download_image(url, name):
    """Save `url` as ./image/<name>.jpg, creating the directory on first use."""
    path = "./image/"
    imagename = str(name) + ".jpg"
    if not os.path.exists(path):
        os.makedirs(path)
    # Dedented out of the `if`: the download now runs on every call,
    # which is the fix the question describes.
    print(path)
    urllib.request.urlretrieve(url, path + imagename)
    time.sleep(1)  # throttle requests between downloads


url = "https://api.XXXkeyword=YYY&limit=1000"
response = urllib.request.urlopen(url)
rss = response.read().decode("utf-8")
soup = BeautifulSoup(rss, "xml")
name = 0
for s in soup.find_all("photo"):
    url = s.find_all("image_url")[0].string
    name += 1
    download_image(url, name)
At last, I can get 1000 images from the API. But I cannot understand why fixing the indentation made it work. Please give me an explanation.

Because in the first example you're only getting the image if your condition passes:
if not os.path.exists(path):
And that condition will only pass once because you immediately create the path:
os.makedirs(path)
For every other iteration of the loop, the condition is false. So the code within the conditional block doesn't execute.
Basically, an if block only executes if the condition is true. When you move your code out of the if block, it always executes regardless of the condition.

Related

BeautifulSoup Assignment got error module 'urllib' has no attribute 'urlopen' Can anyone provide solutions for this?

I am trying to do an assignment: write a Python program that expands on http://www.py4e.com/code3/urllinks.py. The program will use urllib to read the HTML from the data files below, extract the href= values from the anchor tags, scan for a tag that is in a particular position relative to the first name in the list, follow that link and repeat the process a number of times, and report the last name you find.
Actual problem: Start at: http://py4e-data.dr-chuck.net/known_by_Kylen.html
Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. The answer is the last name that you retrieve.
Hint: The first character of the name of the last page that you will load is: P[enter image description here][1]
#Code I used:
import re
import urllib
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Disable certificate verification (standard py4e course boilerplate).
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter URL:')
count = int(input('Enter count:'))
position = int(input('Enter position:')) - 1

# BUG (the error being asked about): Python 3 has no urllib.urlopen;
# the urlopen imported above should be used instead.
html = urllib.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
href = soup('a')
# print(href)
for i in range(count):
    # Follow the link at the requested position, then re-parse that page.
    link = href[position].get('href', None)
    print(href[position].contents[0])
    html = urllib.urlopen(link).read()
    soup = BeautifulSoup(html, "html.parser")
    href = soup('a')
But got an error: html = urllib.urlopen(url, context=ctx).read()
AttributeError: module 'urllib' has no attribute 'urlopen'
Can anyone provide solutions for this?
You imported urlopen already, but never used it. Instead you used urllib.urlopen which doesn't exist.
Instead of using urllib.urlopen just use urlopen
Example:
from urllib.request import urlopen
# before: html = urllib.urlopen(url, context=ctx).read()
html = urlopen(url, context=ctx).read()

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning python3 and I try to solve a simple task. I want to get the name of account and the date of post from instagram link.
# Fetch an Instagram post page and extract the account name from the
# element preceding the og:description meta tag.
import requests
from bs4 import BeautifulSoup
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
# The og:description meta element; the account name is the text of the
# previous sibling's "content" attribute, up to the first bullet.
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code works sometimes with links like this https://www.instagram.com/kingtop
But I need it to work also with post of image like this https://www.instagram.com/p/BuxB00KFI-x/
That's all I could come up with, but it is not working. And I can't get the date either.
Do you have any ideas? I appreciate any help.
I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError

start = time.time()
# NOTE(review): neither this file nor the output file below is ever
# closed — a `with open(...)` would be safer.
file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
for url in urls:
    url = url.strip('\n')
    try:
        req = requests.get(url)
        req.raise_for_status()
    except HTTPError as http_err:
        # NOTE(review): the output file is re-opened (and leaked) on
        # every iteration of the loop.
        output = open('output2.txt', 'a')
        output.write(f'не найдена\n')
    except Exception as err:
        output = open('output2.txt', 'a')
        output.write(f'не найдены\n')
    else:
        output = open('output2.txt', 'a')
        soup = BeautifulSoup(req.text, "lxml")
        # The canonical link holds the profile URL; strip the host and
        # keep the first path segment, which is the account name.
        the_url = soup.select("[rel='canonical']")[0]['href']
        the_url2 = the_url.replace('https://www.instagram.com/', '')
        head, sep, tail = the_url2.partition('/')
        output.write(head + '\n')

How to scrape all the test match details in cricinfo

I am trying to scrape all the test match details, but it shows HTTP Error 504: Gateway Timeout. I am getting the details of some test matches, but then it fails. This is my code; I have used bs4 to scrape the test match details from Cricinfo.
I need to scrape the details of 2000 test matches this is my code
import urllib.request as req

BASE_URL = 'http://www.espncricinfo.com'
# NOTE(review): os, time, BeautifulSoup, urljoin and unicodedata are used
# below but never imported, and urllib2 does not exist in Python 3 —
# these are part of the code the question is asking about.
if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')
for i in range(0, 2000):
    # One search-results page per iteration, 2000 pages total.
    soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + urljoin(BASE_URL, new_host)
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            # Save the match page under the 5th path segment of its URL.
            with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
                f.write(html)
            print(html)
        else:
            print("no html")
This usually happens when you make multiple requests too fast; the server may be down, or your connection blocked by the server's firewall. Try increasing your sleep() interval or adding a random sleep.
import random
.....
for i in range(0, 2000):
soupy = BeautifulSoup(....)
time.sleep(random.randint(2,6))
not sure why, seems to be working for me.
I made a few changes in the loop through the links. I'm not sure how you're wanting the output to look in terms of writing it to your file, so I left that part alone. But like I said, seems to be working ok on my end.
import bs4
import requests
import os
import time
import urllib.request as req

BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('C:/espncricinfo-fc'):
    os.mkdir('C:/espncricinfo-fc')
for i in range(0, 2000):
    # NOTE(review): this line (present in the quoted answer) overwrites the
    # loop counter, so every request fetches page 0.
    i = 0
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' % i
    html = requests.get(url)
    print('Checking page %s of 2000' % (i + 1))
    soupy = bs4.BeautifulSoup(html.text, 'html.parser')
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + new_host
        new_host = odiurl
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            # File name: the URL path segments from index 4 on, joined by "_".
            with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
                f.write(html)
            # print(html)
        else:
            print("no html")

Can't print tag 'content' anymore

I had a perfectly well working scraper for TripAdvisor, it met all my needs, then I tried to use it after a four day break and something went wrong, I quickly realized that TA had changed some of the tags, I made the appropriate changes and I still couldn't get it working as before. I want to grab the value of the 'content' tag within an element.
This is the element:
<div class="prw_rup prw_common_bubble_rating bubble_rating" data-prwidget-init="" data-prwidget-name="common_bubble_rating"><span alt="5 of 5 bubbles" class="ui_bubble_rating bubble_50" content="5" property="ratingValue" style="font-size:18px;"></span></div>
and here is the code:
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([img["content"] for img in bubs.select("img[content]")])
but now it only gives me an empty '[]' instead of the content which is '5'.
Anybody know what may have changed?
here is the rest of my code
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

theurl = "https://www.tripadvisor.com/Hotels-g147364-c3-Cayman_Islands-Hotels.html"
thepage = urllib
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
base_url = "https://www.tripadvisor.com"
urls = []
init_info = []
init_data = open('/Users/paribaker/Desktop/scrapping/TripAdvisor/Inv/speccaydata.txt', 'w')

# Collect every hotel-review link from the listing page.
for link in soup.findAll('a', href=re.compile('/Hotel_Review')):
    urls.append(base_url + (link.get('href')).strip("#REVIEWS"))


def remove_duplicates(urls):
    """Return `urls` with duplicates removed, preserving first-seen order."""
    output = []
    seen = set()
    for line in urls:
        if line not in seen:
            output.append(line)
            seen.add(line)
    return output


urls2 = remove_duplicates(urls)

for url in urls2:
    try:
        # Use a browser so the language-filter click renders all reviews.
        driver = webdriver.Chrome()
        driver.get(url)
        element = driver.find_element_by_id("taplc_prodp13n_hr_sur_review_filter_controls_0_filterLang_ALL").click()
        print("succesfull")
        moreinfo = driver.page_source
        moresoup = BeautifulSoup(moreinfo, "html.parser")
        driver.close()
        # moreinfo = urllib
        # moreinfo = urllib.request.urlopen(url)
        # moresoup = BeautifulSoup(moreinfo, "html.parser")
    except:
        print("none")
    for data in moresoup.findAll('div', {"class": "heading_2014 hr_heading"}):
        try:
            for title in data.findAll('h1', {'id': "HEADING"}):
                init_info.append(title.text.strip("\n") + ",\t")
            for add_data in data.findAll('span', {'class': 'format_address'}):
                print((add_data.find('span', {'class': 'street-address'}).text + ",\t"))
                init_info.append(add_data.find('span', {'class': 'street-address'}).text + ",\t")
                init_info.append(add_data.find('span', {'class': 'locality'}).text + ",\t")
                init_info.append(add_data.find('span', {'class': 'country-name'}).text + ",\t")
            for reviews in data.findAll('a', {'class': 'more taLnk'}):
                # NOTE(review): list.append() returns None, so the .strip()
                # raises AttributeError — swallowed by the except below.
                init_info.append(reviews.text).strip("\n")
                init_info.append(", \t")
                # init_info.append([img["alt"] for img in stars.select("img[alt]")])
                # init_info.append([img["content"] for img in stars.select("img[content]")])
        except:
            init_info.append("N/A" + ", /t")
The element with the content="5" attribute is a span, not an img.
Does this get what you want?
for bubs in data.findAll('div',{'class':"prw_rup prw_common_bubble_rating bubble_rating"}):
print([elem["content"] for elem in bubs.select("span[content]")])

Python3 Pickle hitting recursion limit

I have the following block of code that when executed on my Ubuntu computer using Python3 hits the recursion error for pickling. I don't understand why since the object to be pickled is not particularly complex and doesn't involve any custom objects. In fact, it is only a list of some 500 elements (approximately); each element of the list is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could up the recursion limit with import sys and sys.setrecursionlimit() but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle


def get_standard_and_poors_500_constituents():
    """Scrape the list of S&P 500 constituent tickers from Wikipedia."""
    # URL request, URL opener, read content.
    req = request.Request(
        "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    opener = request.urlopen(req)
    # Convert bytes to UTF-8.
    content = opener.read().decode()
    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].findAll("a", {"class": "external text"})
    # NOTE(review): ext.string is a bs4 NavigableString that keeps a
    # reference back into the whole parse tree — pickling it recurses
    # through the tree, which is the reported RecursionError. Converting
    # with str(ext.string) would yield plain strings and avoid it.
    c = [ext.string for ext in external_class if not "reports" in ext]
    return c


sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"


def main():
    """Pickle today's constituent list to a dated .pkl file."""
    import datetime as dt
    today = dt.datetime.today().date()
    fname = "sp500_constituents_" + str(today) + ".pkl"
    with open(fname, "wb") as f:
        pickle.dump(sp500_constituents, f)


if __name__ == "__main__":
    main()

Resources