Trouble reading documents within links - python-3.x

I can't seem to figure out how to read the text inside this link, which is a PDF document. I do not get any errors, but I also do not get back any text.
from io import BytesIO
from urllib.request import urlopen
import PyPDF2

def read_pdf_from_url(url):
    try:
        response = urlopen(url)
        pdf_file = BytesIO(response.read())
        reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"An error occurred: {e}")

url = 'https://probaterecords.shelbyal.com/shelby/search.do?indexName=shelbyimages&lq=Instrument%3A19890308000066440&page=1&view=FitV&scrollbar=0&navpanes=0&statusbar=0&messages=0?iframe=true&width=50%25&height=95%25'
text = read_pdf_from_url(url)
print(text)
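One quick check worth adding (a diagnostic sketch, not a fix from the thread): that URL looks like it points at an HTML viewer page rather than at the PDF file itself, in which case PyPDF2 may have nothing meaningful to extract. A real PDF response should have a Content-Type of application/pdf and start with the %PDF magic bytes:

from urllib.request import urlopen

response = urlopen(url)  # the url defined above
data = response.read()
print(response.headers.get('Content-Type'))  # expect 'application/pdf'
print(data[:8])                              # expect something like b'%PDF-1.7'

If either check fails, the next step is finding the direct document URL that the viewer itself loads.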

Related

Difficulty reading online XML content with Python, xml.etree.ElementTree and urllib

I am reading XML online from an RSS feed using Python, xml.etree.ElementTree and urllib.
My code seems straightforward but is not giving me the results that I want.
No matter what I do, it always returns what looks like all the data in the XML stream.
I am open to better suggestions on how to read specific strings into lists.
See my code below:
import xml.etree.ElementTree as ET
from urllib import request

title_list = []

def main():
    try:
        response = request.urlopen("https://www.abcdefghijkl.xml")
        rsp_code = response.code
        print(rsp_code)
        if rsp_code == 200:
            webdata = response.read()
            print("1")
            xml = webdata.decode('UTF-8')
            print("2")
            tree = ET.parse(xml)
            print("3")
            items = tree.findall('channel')
            print("4")
            for item in items:
                title = item.find('title').text
                title_list.append(title)
                print(f"title_list 0 is, {title_list}")
            print("5")
    except Exception as e:
        print(f'An error occurred {str(e)}')

main()
Thanks, everyone, I figured it out after an awesome Udemy video. I eventually used the bs4 (Beautiful Soup) Python library and requests. Here's the code below:
import bs4
import requests

title_list = []

def main():
    try:
        result = requests.get("https://abcdefghijk.xml")
        res_text = result.text
        soup = bs4.BeautifulSoup(res_text, features="xml")
        title_tag_list = soup.select('title')
        for titles in title_tag_list:
            title = titles.text
            title_list.append(title)
            print(f"title_list 0 is, {title_list}")
        print("5")
    except Exception as e:
        print(f'An error occurred {str(e)}')

main()
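For the record, the original ElementTree approach can also be made to work: ET.parse() expects a filename or file object, while ET.fromstring() parses XML from a string, and in an RSS feed the titles sit under channel/item rather than directly under channel. A minimal sketch (the feed URL is a placeholder, as in the question):

import xml.etree.ElementTree as ET
from urllib import request

def rss_titles(url):
    with request.urlopen(url) as response:
        # fromstring() takes the XML text itself; parse() would treat it as a filename
        root = ET.fromstring(response.read().decode('utf-8'))
    # RSS layout: <rss><channel><item><title>...</title></item>...</channel></rss>
    return [item.findtext('title') for item in root.iter('item')]

print(rss_titles("https://www.abcdefghijkl.xml"))  # placeholder URL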

Python web scraper won't save image files

I started working on a small image-scraping terminal program that is supposed to save images to a specified folder within the program hierarchy. This comes from a basic tutorial I found online. However, whenever I enter a search term into the terminal to start scraping bing.com (yeah, I know), the program crashes. The errors I get seem to focus on either the image file type not being recognized or the file path where the images will be saved not being recognized:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO

search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    img = Image.open(BytesIO(img_obj.content))
    img.save("./scraped_images/" + title, img.format)
Error thrown: Exception has occurred: FileNotFoundError
[Errno 2] No such file or directory: './scraped_images/3849747391_4a7dc3f19e_b.jpg'
I've tried adding a file path variable (using pathlib) and concatenating that with the other necessary variables:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path

image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    img = Image.open(BytesIO(img_obj.content))
    img.save(image_folder + title, img.format)
Error thrown: Exception has occurred: TypeError
unsupported operand type(s) for +: 'WindowsPath' and 'str'
I've checked the documentation for PIL, BeautifulSoup, etc. to see if any updates may have been screwing me up, I've checked the elements on Bing to see if the classes are correct, and I even tried searching by a different class, and nothing worked. I'm at a loss. Any thoughts or guidance are appreciated. Thanks!
I have changed your code a bit:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import os

image_folder = Path("./scraped_images/")
if not os.path.isdir(image_folder):
    print('Making %s' % (image_folder))
    os.mkdir(image_folder)
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    if img_obj.ok:
        with open('%s/%s' % (image_folder, title), 'wb') as file:
            file.write(img_obj.content)
You could use PIL, but in this case you do not need it.
I also improved the PIL version of the code to work better:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path

s = requests.Session()
image_folder = Path("./scraped_images/")
image_folder.mkdir(exist_ok=True)  # make sure the target folder exists
search = input("Search for:")
params = {"q": search}
r = s.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    try:
        img_obj = s.get(item.attrs["href"], headers={'User-Agent': 'Mozilla/5.0'})
        if img_obj.ok:
            print("Getting", item.attrs["href"])
            title = item.attrs["href"].split("/")[-1]
            if '?' in title:
                title = title.split('?')[0]
            img = Image.open(BytesIO(img_obj.content))
            img.save(str(image_folder) + '/' + title, img.format)
        else:
            continue
    except OSError:
        print('\nError downloading %s try to visit'
              '\n%s\n'
              'manually and try to get the image manually.\n' % (title, item.attrs["href"]))
I used a requests session and added try/except in case PIL can't make the image. I also only try to save an image if the request gets a 200 response from the site.
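As an aside on the TypeError in the question: pathlib paths are joined with the / operator (or .joinpath()), not string concatenation, and PIL's save() accepts a Path directly. A minimal sketch:

from pathlib import Path

image_folder = Path("./scraped_images")
image_folder.mkdir(exist_ok=True)      # also avoids the FileNotFoundError from the first attempt
target = image_folder / "example.jpg"  # '/' joins a Path and a str; '+' raises TypeError
print(target)                          # scraped_images/example.jpg (backslashes on Windows)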

Problems displaying web-scraped values in Tkinter

I am trying to display an image inside a Tkinter window through web scraping, based on a message at a certain point on the website that says, in Portuguese, "Capella Ganhou" ("Capella Won") or "Procyon Ganhou" ("Procyon Won").
I tried to look in different forums for a solution, but I couldn't find any that fixes the error in my situation. I also tested with print to make sure the string exists and returns its value, and I also tried encapsulating it in a variable.
The source code and the error are below.
from tkinter import *
import urllib.request
from bs4 import BeautifulSoup
from IPython.display import Image, display
from PIL import Image, ImageTk
import io

def capella_tg():
    resultado_tg = StringVar()
    resultado_tg.set(soup.find_all("font")[5].string)
    label_resultado_tg = Label(root, textvariable=resultado_tg).pack()

def procyon_tg():
    resultado_tg = StringVar()
    resultado_tg.set(soup.find_all("font")[4].string[3:])
    label_resultado_tg = Label(root, textvariable=resultado_tg).pack()

def img_capella():
    raw_data = urllib.request.urlopen("https://i.imgur.com/AHLqtt0.jpg").read()
    im = Image.open(io.BytesIO(raw_data))
    image = ImageTk.PhotoImage(im)
    label1 = Label(root, image=image).pack()

def img_procyon():
    raw_data = urllib.request.urlopen("https://i.imgur.com/TQyCnfD.jpg").read()
    im = Image.open(io.BytesIO(raw_data))
    image = ImageTk.PhotoImage(im)
    label1 = Label(root, image=image).pack()

root = Tk()
with urllib.request.urlopen("http://www.cabaleasy.com") as url:
    page = url.read()
soup = BeautifulSoup(page, "html.parser")
#print(soup.find_all("font")[5].string)
try:
    capella_tg()
except:
    procyon_tg()
if capella_tg():
    img_capella()
elif procyon_tg():
    img_procyon()
root.mainloop()
-------ERROR---------
Traceback (most recent call last):
File "C:/Users/LucasDEV/PycharmProjects/LucasDEV/WEB_SCRAPPING/TESTES.py", line 49, in <module>
elif procyon_tg():
File "C:/Users/LucasDEV/PycharmProjects/LucasDEV/WEB_SCRAPPING/TESTES.py", line 17, in procyon_tg
resultado_tg.set(soup.find_all("font")[4].string[3:])
TypeError: 'NoneType' object is not subscriptable
@Dipen Shah That is what I used the try/except structure for. But it's not working, and I'm getting the same error. I also tried this way:
def capella_tg():
    resultado_tg = StringVar()
    try:
        resultado_tg.set(soup.find_all("font")[5].string)
        label_resultado_tg = Label(root, textvariable=resultado_tg).pack()
    except:
        pass
I tested just using text and it works:
import urllib.request
from bs4 import BeautifulSoup

with urllib.request.urlopen("http://www.cabaleasy.com") as url:
    page = url.read()
soup = BeautifulSoup(page, "html.parser")
try:
    print(soup.find_all("font")[5].string)
    #At this moment, it will not be showing, cuz the status changes every hour
except:
    print(soup.find_all("font")[4].string[3:])
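Two things in the question's code likely explain the behavior, independent of the scraping itself: capella_tg() and procyon_tg() return None, so if capella_tg(): can never be true, and Label(...).pack() returns None while the PhotoImage needs a live reference or Tkinter may garbage-collect it before drawing. A hedged sketch of the control flow, reusing the imports, root, and soup from the question's code (the index logic is copied from the question):

def get_result():
    # Return the status text, or None if the expected <font> tag is missing
    fonts = soup.find_all("font")
    try:
        return fonts[5].string or fonts[4].string[3:]
    except (IndexError, TypeError):
        return None

def show_image(url):
    raw_data = urllib.request.urlopen(url).read()
    image = ImageTk.PhotoImage(Image.open(io.BytesIO(raw_data)))
    label = Label(root, image=image)
    label.image = image  # keep a reference, or Tkinter may drop the image
    label.pack()

result = get_result()
if result and "Capella" in result:
    show_image("https://i.imgur.com/AHLqtt0.jpg")
elif result and "Procyon" in result:
    show_image("https://i.imgur.com/TQyCnfD.jpg")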

Python Web Scrape Unknown Number of Pages

I have working code that scrapes a single Craigslist page for specific information, but what would I need to add in order to grab the data from ALL of the pages (not knowing how many pages ahead of time)?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date"
uClient = uReq(my_url)      #sends GET request to URL
page_html = uClient.read()  #reads returned data and puts it in a variable
uClient.close()             #close the connection

#create a file that we will want later to write parsed data to
filename = "ScrapedData.csv"
f = open(filename, 'w')
headers = "date, location, title, price\n"
f.write(headers)

#use BS to parse the webpage
page_soup = soup(page_html, 'html.parser')  #applying BS to the obtained html
containers = page_soup.findAll('p', {'class': 'result-info'})
for container in containers:
    container_date = container.findAll('time', {'class': 'result-date'})
    date = container_date[0].text
    try:
        container_location = container.findAll('span', {'class': 'result-hood'})
        location = container_location[0].text
    except:
        try:
            container_location = container.findAll('span', {'class': 'nearby'})
            location = container_location[0].text
        except:
            location = 'NULL'
    container_title = container.findAll('a', {'class': 'result-title'})
    title = container_title[0].text
    try:
        container_price = container.findAll('span', {'class': 'result-price'})
        price = container_price[0].text
    except:
        price = 'NULL'
    #to print to screen
    print('date:' + date)
    print('location:' + location)
    print('title:' + title)
    print('price:' + price)
    #to write to csv
    f.write(date + ',' + location.replace(",", "-") + ',' + title.replace(",", " ") + ',' + price + '\n')
f.close()
Apart from what sir Andersson has already shown, you can also do it this way for this site:
import requests
from bs4 import BeautifulSoup
import csv

page_link = "https://portland.craigslist.org/search/sss?s={}&query=electronics&sort=date"

for link in [page_link.format(page) for page in range(0, 1147, 120)]:  #this is the fix
    res = requests.get(link)
    soup = BeautifulSoup(res.text, 'lxml')
    for container in soup.select('.result-info'):
        try:
            date = container.select('.result-date')[0].text
        except IndexError:
            date = ""
        try:
            title = container.select('.result-title')[0].text
        except IndexError:
            title = ""
        try:
            price = container.select('.result-price')[0].text
        except IndexError:
            price = ""
        print(date, title, price)
        with open("craigs_item.csv", "a", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            writer.writerow([date, title, price])
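The hard-coded range(0, 1147, 120) reflects the result count at the time of writing: Craigslist shows 120 listings per page, and the s parameter is the offset of the first result on the page. Rather than hard-coding the total, you could read it from the first page; a hedged sketch, assuming the results page exposes the total in an element with class totalcount (that selector is an assumption, verify it against the live page):

import requests
from bs4 import BeautifulSoup

res = requests.get("https://portland.craigslist.org/search/sss?query=electronics&sort=date")
soup = BeautifulSoup(res.text, 'lxml')
total = int(soup.select_one('.totalcount').text)  # '.totalcount' is assumed, not confirmed
offsets = range(0, total, 120)  # one offset per page of 120 results
print(list(offsets)[:5])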
You can try to loop through all pages by incrementing the "s" parameter in the URL until you hit a page with no results (a page containing the text "search and you will find"):
import requests
from bs4 import BeautifulSoup as soup

filename = "ScrapedData.csv"
f = open(filename, 'w')
headers = "date, location, title, price\n"
f.write(headers)

results_counter = 0
while True:
    my_url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % results_counter
    page_html = requests.get(my_url).text
    if "search and you will find" in page_html:  # the empty-results page: stop
        break
    page_soup = soup(page_html, 'html.parser')  #applying BS to the obtained html
    containers = page_soup.findAll('p', {'class': 'result-info'})
    ...
    results_counter += 120
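A slightly sturdier stop condition than matching the "search and you will find" text (which could change) is to stop as soon as a page yields no result rows at all; a minimal sketch using the same selectors as above:

import requests
from bs4 import BeautifulSoup

s = 0
while True:
    url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % s
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    rows = page.select('p.result-info')
    if not rows:  # no listings parsed: we have run past the last page
        break
    for row in rows:
        date = row.select_one('.result-date')
        title = row.select_one('.result-title')
        print(date.text if date else '', title.text if title else '')
    s += 120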

Scraping a forum: cannot scrape posts that have a table within them

I've almost finished writing my first scraper!
I've run into a snag, however: I can't seem to grab the contents of posts that contain a table (posts that cite another post, in other words).
This is the code that extracts post contents from the soup object. It works just fine:
def getPost_contents(soup0bj):
    try:
        soup0bj = (soup0bj)
        post_contents = []
        for content in soup0bj.findAll('', {'class': 'post_content'}, recursive='True'):
            post_contents.append(content.text.strip())
    ...  #Error management
    return (post_contents)
Here's an example of what I need to scrape (highlighted in yellow):
Problem post
(URL, just in case: http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906)
How do I get the contents that I've highlighted? And why does my current getPost_contents function not work in this particular instance? As far as I can see, the strings are still under div class="post_content".
EDIT:
This is how I am getting my BeautifulSoup:
from bs4 import BeautifulSoup as Soup

def getHTMLsoup(url):
    try:
        html = urlopen(url)
    ...  #Error management
    try:
        soup0bj = Soup(html.read().decode('utf-8', 'replace'))
        time.sleep(5)
    ...  #Error management
    return (soup0bj)
EDIT 2:
These are the relevant bits of the scraper: (Sorry about the dump!)
from bs4 import BeautifulSoup as Soup
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError, URLError
import time, re

def getHTMLsoup(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
        print('The server hosting {} is unavailable.'.format(url), '\n')
        print('Trying again in 10 minutes...', '\n')
        time.sleep(600)
        getHTMLsoup(url)
    except URLError as e:
        return None
        print('The webpage found at {} is unavailable.'.format(url), '\n')
        print('Trying again in 10 minutes...', '\n')
        time.sleep(600)
        getHTMLsoup(url)
    try:
        soup0bj = Soup(html.read().decode('utf-8', 'replace'))
        time.sleep(5)
    except AttributeError as e:
        return None
        print("Ooops, {}'s HTML structure wasn't detected.".format(url), '\n')
    return soup0bj

def getMessagetable(soup0bj):
    try:
        soup0bj = (soup0bj)
        messagetable = []
        for data in soup0bj.findAll('tr', {'class': re.compile('message.*')}, recursive='True'):
            messagetable.append(data)
    except AttributeError as e:
        print(' ')
    return (messagetable)

def getTime_stamps(soup0bj):
    try:
        soup0bj = (soup0bj)
        time_stamps = []
        for stamp in soup0bj.findAll('span', {'class': 'topic_posted'}):
            time_stamps.append(re.search('..\/..\/20..', stamp.text).group(0))
    except AttributeError as e:
        print('No time-stamps found. Moving on.', '\n')
    return (time_stamps)

def getHandles(soup0bj):
    try:
        soup0bj = (soup0bj)
        handles = []
        for handle in soup0bj.findAll('span', {'data-id_user': re.compile('.*')}, limit=1):
            handles.append(handle.text)
    except AttributeError as e:
        print("")
    return (handles)

def getPost_contents(soup0bj):
    try:
        soup0bj = (soup0bj)
        post_contents = []
        for content in soup0bj.findAll('div', {'class': 'post_content'}, recursive='True'):
            post_contents.append(content.text.strip())
    except AttributeError as e:
        print('Ooops, something has gone wrong!')
    return (post_contents)

html = ('http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm')
for soup in getHTMLsoup(html):
    for messagetable in getMessagetable(soup):
        print(getTime_stamps(messagetable), '\n')
        print(getHandles(messagetable), '\n')
        print(getPost_contents(messagetable), '\n')
The problem is your decoding: it is not utf-8. If you remove the "replace", your code will error with:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 253835: invalid continuation byte
The data seems to be latin-1 encoded; decoding as latin-1 causes no errors, but the output does look off in certain parts. Using
html = urlopen(r).read().decode("latin-1")
will work, but as I mentioned, you get weird output like:
"diabète en cas d'accident de la route ou malaise isolÊ ou autre ???"
Another option would be to pass an accept-charset header:
from urllib.request import Request, urlopen
headers = {"accept-charset":"utf-8"}
r = Request("http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906",headers=headers)
html = urlopen(r).read()
I get the exact same encoding issue using requests and letting it handle the encoding; it is as if the data has mixed encoding, some utf-8 and some latin-1. The headers returned from requests show the Content-Encoding as gzip:
'Content-Encoding': 'gzip'
If we specify that we want gzip and decode it ourselves:
from urllib.request import Request, urlopen
headers = {"Accept-Encoding":"gzip"}
r = Request("http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906",headers=headers)
r = urlopen(r)
import gzip
gzipFile = gzip.GzipFile(fileobj=r)
print(gzipFile.read().decode("latin-1"))
We get the same errors with utf-8 and the same weird output decoding to latin-1. Interestingly, in Python 2 both requests and urllib work fine.
Using chardet:
r = urlopen(r)
import chardet
print(chardet.detect(r.read()))
it reckons, with around 71 percent confidence, that it is ISO-8859-2, but that again gives the same bad output:
{'confidence': 0.711104254322944, 'encoding': 'ISO-8859-2'}
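One more thing worth trying (my suggestion, not from the answer above): hand the raw bytes straight to BeautifulSoup and let its built-in detection (Unicode, Dammit) pick the encoding, or force one with the from_encoding argument. With genuinely mixed encodings no single choice will be perfect, but this keeps the guessing in one place. A minimal sketch:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm').read()
# Passing bytes (not a decoded str) lets bs4 run its own encoding detection
soup = BeautifulSoup(html, 'html.parser')
print(soup.original_encoding)  # what bs4 decided the encoding was
# To override the guess, e.g. force latin-1:
# soup = BeautifulSoup(html, 'html.parser', from_encoding='latin-1')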
