I can't seem to figure out how to read the text inside this link which is a pdf document. I do not get any errors nor do I get back any type of text.
from io import BytesIO
from urllib.request import urlopen
import PyPDF2
def read_pdf_from_url(url):
    """Download the PDF served at *url* and return the text of all its pages.

    Args:
        url: Address handed to ``urlopen``; it is expected to serve PDF bytes.

    Returns:
        The text of every page concatenated in page order, or ``None`` when
        downloading or parsing raised an exception (the error is printed).
    """
    try:
        # Buffer the body, then release the connection straight away.
        with urlopen(url) as response:
            pdf_file = BytesIO(response.read())
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` keeps the join safe if a page yields no text object.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Best-effort helper: report the problem and signal failure explicitly.
        print(f"An error occurred: {e}")
        return None
# Address of the document to read.
# NOTE(review): this is a `search.do` endpoint with viewer parameters, so it may
# answer with an HTML viewer page rather than raw PDF bytes — confirm what the
# server actually returns before blaming the PDF parsing.
url = 'https://probaterecords.shelbyal.com/shelby/search.do?indexName=shelbyimages&lq=Instrument%3A19890308000066440&page=1&view=FitV&scrollbar=0&navpanes=0&statusbar=0&messages=0?iframe=true&width=50%25&height=95%25'
# Extract the text; the helper has no return on its error path, so `text` is
# None (and "None" is printed) whenever it reported an error.
text = read_pdf_from_url(url)
print(text)
I'm in the process of learning Python 3 and am trying to solve a simple task: I want to get the account name and the post date from an Instagram link.
import requests
from bs4 import BeautifulSoup
# Download the post page; a timeout keeps the script from hanging forever and
# raise_for_status() turns HTTP error pages into an explicit exception.
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR', timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'lxml')

# The account name is read from the "content" attribute of the element that
# directly precedes the og:description <meta> tag.
item = soup.select_one("meta[property='og:description']")
name = None
if item is not None:
    previous = item.find_previous_sibling()
    content = previous.get("content") if previous is not None else None
    if content:
        name = content.split("•")[0]

# Prints None when the expected tags are missing instead of crashing with
# an AttributeError.
print(name)
This code sometimes works with profile links like this: https://www.instagram.com/kingtop
But I also need it to work with image posts like this: https://www.instagram.com/p/BuxB00KFI-x/
That's all I could come up with, but it is not working, and I can't get the date either.
Do you have any ideas? I appreciate any help.
I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError
start = time.time()

# Read the URL list (one per line); the file is closed as soon as it is read.
with open('users.txt', 'r', encoding="ISO-8859-1") as file:
    urls = file.readlines()

# A single append-mode handle for the whole run, closed on exit, instead of a
# new never-closed handle per URL.
with open('output2.txt', 'a') as output:
    for url in urls:
        url = url.strip('\n')
        try:
            req = requests.get(url)
            req.raise_for_status()
        except HTTPError:
            # HTTP-level failure (4xx/5xx): write the "not found" marker.
            output.write('не найдена\n')
        except Exception:
            # Any other request failure gets its own marker.
            output.write('не найдены\n')
        else:
            soup = BeautifulSoup(req.text, "lxml")
            canonical = soup.select_one("[rel='canonical']")
            if canonical is None:
                # No canonical link: write a marker so output lines stay
                # aligned with input lines instead of aborting the batch.
                output.write('не найдена\n')
                continue
            # The account name is the first path segment of the canonical URL.
            the_url = canonical['href']
            the_url2 = the_url.replace('https://www.instagram.com/', '')
            head, sep, tail = the_url2.partition('/')
            output.write(head + '\n')
I've almost finished writing my first scraper!
I've run into a snag, however: I can't seem to grab the contents of posts that contain a table (posts that cite another post, in other words).
This is the code that extracts post contents from the soup object. It works just fine:
# Excerpt: gathers the stripped text of every element whose class is
# 'post_content' and returns the texts as a list. The error handling is elided
# by the "...#Error management" placeholder, so no handler for the `try` is
# visible here.
def getPost_contents(soup0bj):
try:
soup0bj = (soup0bj)
post_contents = []
# NOTE(review): the tag name is an empty string — presumably meant as "any
# tag"; confirm. `recursive` receives the string 'True' (truthy), not the
# boolean True.
for content in soup0bj.findAll('', {'class' : 'post_content'}, recursive = 'True'):
post_contents.append(content.text.strip())
...#Error management
return (post_contents)
Here's an example of what I need to scrape (highlighted in yellow):
Problem post
(URL, just in case: http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906)
How do I get the contents that I've highlighted? And why does my current getPost_contents function not work in this particular instance? As far as I can see, the strings are still under div class=post_content.
EDIT EDIT EDIT
This is how I am getting my BeautifulSoup:
from bs4 import BeautifulSoup as Soup
# Excerpt: opens `url`, decodes the body and parses it into a soup object.
# Both exception handlers are elided by the "...#Error management"
# placeholders. `urlopen` and `time` are used but not imported in this excerpt.
def getHTMLsoup(url):
try:
html = urlopen(url)
...#Error management
try:
# Decodes as UTF-8; with 'replace', undecodable bytes are substituted instead
# of raising, so a wrong encoding stays silent.
soup0bj = Soup(html.read().decode('utf-8', 'replace'))
# Waits five seconds after parsing (presumably a politeness delay — confirm).
time.sleep(5)
...#Error management
return (soup0bj)
EDIT2 EDIT2 EDIT2
These are the relevant bits of the scraper: (Sorry about the dump!)
from bs4 import BeautifulSoup as Soup
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError, URLError
import time, re
def getHTMLsoup(url, retries=0):
    """Download *url* and parse it into a BeautifulSoup object.

    Previously the diagnostic messages and the retry sat after ``return None``
    and could never run. The messages are now printed before returning, and
    retrying is opt-in through *retries*.

    Args:
        url: Address of the page to fetch.
        retries: Extra attempts to make after an ``HTTPError``/``URLError``,
            waiting ten minutes before each one. The default ``0`` gives up
            after the first failure, as the function always did.

    Returns:
        The parsed soup, or ``None`` when the page could not be fetched or
        parsed.
    """
    attempts_left = retries
    while True:
        try:
            html = urlopen(url)
        except HTTPError:
            # HTTPError must be caught before URLError: it is a subclass.
            print('The server hosting{} is unavailable.'.format(url), '\n')
        except URLError:
            print('The webpage found at {} is unavailable.'.format(url), '\n')
        else:
            break
        if attempts_left <= 0:
            return None
        attempts_left -= 1
        print('Trying again in 10 minutes...', '\n')
        time.sleep(600)

    try:
        # `with` closes the HTTP response once the body has been read.
        with html:
            soup0bj = Soup(html.read().decode('utf-8', 'replace'))
    except AttributeError:
        print("Ooops, {}'s HTML structure wasn't detected.".format(url), '\n')
        return None
    # Pause after each successful download, as the original did.
    time.sleep(5)
    return soup0bj
def getMessagetable(soup0bj):
    """Return every ``<tr>`` whose class matches ``message.*``.

    The original loop had no body, so nothing was ever collected; each
    matching row is now added to the result.

    Args:
        soup0bj: Object exposing ``find_all`` (a soup or tag).

    Returns:
        List of matching rows; empty when *soup0bj* lacks ``find_all``
        (for example when it is ``None``).
    """
    messagetable = []
    try:
        rows = soup0bj.find_all(
            'tr', {'class': re.compile('message.*')}, recursive=True
        )
        messagetable.extend(rows)
    except AttributeError:
        print(' ')
    return messagetable
def getTime_stamps(soup0bj):
    """Extract ``dd/mm/20yy`` dates from every ``span.topic_posted`` element.

    Args:
        soup0bj: Object exposing ``find_all`` (a soup or tag).

    Returns:
        List of date strings in document order. Spans without a date are
        skipped; previously the first such span raised AttributeError on
        ``None.group`` and silently ended the whole loop.
    """
    time_stamps = []
    try:
        for stamp in soup0bj.find_all('span', {'class': 'topic_posted'}):
            # Raw string: same pattern, without the invalid "\/" escape warning.
            match = re.search(r'..\/..\/20..', stamp.text)
            if match is not None:
                time_stamps.append(match.group(0))
    except AttributeError:
        print('No time-stamps found. Moving on.', '\n')
    return time_stamps
def getHandles(soup0bj):
    """Return the text of the first ``<span>`` carrying a ``data-id_user`` attribute.

    Args:
        soup0bj: Object exposing ``find_all`` (a soup or tag).

    Returns:
        A list with at most one handle (the search uses ``limit=1``); empty
        when nothing matches or *soup0bj* lacks ``find_all``.
    """
    handles = []
    try:
        matches = soup0bj.find_all(
            'span', {'data-id_user': re.compile('.*')}, limit=1
        )
        handles.extend(handle.text for handle in matches)
    except AttributeError:
        print("")
    return handles
def getPost_contents(soup0bj):
    """Return the stripped text of every ``div.post_content`` element.

    Args:
        soup0bj: Object exposing ``find_all`` (a soup or tag).

    Returns:
        List of post texts in document order; empty when nothing matches or
        *soup0bj* lacks ``find_all``.
    """
    post_contents = []
    try:
        # recursive is a real boolean now; the string 'True' only worked
        # because any non-empty string is truthy.
        divs = soup0bj.find_all('div', {'class': 'post_content'}, recursive=True)
        post_contents.extend(content.text.strip() for content in divs)
    except AttributeError:
        print('Ooops, something has gone wrong!')
    return post_contents
html = ('http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm')

# Parse the page once. The original looped over the return value, which walks
# the soup's top-level children and raises TypeError when the fetch failed
# and None came back.
soup = getHTMLsoup(html)
if soup is not None:
    # Each message row is searched on its own for timestamp, handle and text.
    for messagetable in getMessagetable(soup):
        print(getTime_stamps(messagetable), '\n')
        print(getHandles(messagetable), '\n')
        print(getPost_contents(messagetable), '\n')
The problem is your decoding: the page is not UTF-8. If you remove the "replace" argument, your code will fail with:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 253835: invalid continuation byte
The data seems to be latin-1 encoded; decoding as latin-1 causes no errors, but the output does look off in certain parts. Using:
html = urlopen(r).read().decode("latin-1")
will work but as I mentioned, you get weird output like:
"diabète en cas d'accident de la route ou malaise isolÊ ou autre ???"
Another option would be to pass an accept-charset header:
from urllib.request import Request, urlopen
# Ask the server for UTF-8 through the accept-charset request header.
headers = {"accept-charset": "utf-8"}
r = Request("http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906", headers=headers)
# `with` closes the HTTP response once the raw bytes have been read.
with urlopen(r) as response:
    html = response.read()
I get the exact same encoding issue when using requests and letting it handle the encoding; it is as if the data has mixed encodings, some UTF-8 and some latin-1. The headers returned from requests show the content-encoding as gzip:
'Content-Encoding': 'gzip'
if we specify we want gzip and decode:
from urllib.request import Request, urlopen
# Request a gzip-compressed body explicitly.
headers = {"Accept-Encoding": "gzip"}
r = Request("http://forum.doctissimo.fr/sante/diabete/savoir-diabetique-sujet_170840_1.htm#t657906", headers=headers)
import gzip
# `r` is rebound from the request to the response, as before; both the
# response and the decompressor are closed when the block exits.
with urlopen(r) as r, gzip.GzipFile(fileobj=r) as gzipFile:
    print(gzipFile.read().decode("latin-1"))
We get the same errors with UTF-8 and the same weird output when decoding as latin-1. Interestingly, in Python 2 both requests and urllib work fine.
Using chardet:
import chardet
# `r` is the request on entry and is rebound to the response, as before; the
# response is closed once its bytes have been handed to the detector.
with urlopen(r) as r:
    print(chardet.detect(r.read()))
reckons with around 71 percent confidence that it is ISO-8859-2 but that again gives the same bad output.
{'confidence': 0.711104254322944, 'encoding': 'ISO-8859-2'}
I have the following block of code that when executed on my Ubuntu computer using Python3 hits the recursion error for pickling. I don't understand why since the object to be pickled is not particularly complex and doesn't involve any custom objects. In fact, it is only a list of some 500 elements (approximately); each element of the list is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could up the recursion limit with import sys and sys.setrecursionlimit() but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle
def get_standard_and_poors_500_constituents():
    """Scrape the S&P 500 constituent link texts from Wikipedia.

    Returns:
        A list of plain ``str`` values, one per "external text" link in the
        first table of the page, excluding the "reports" links.
    """
    # URL request, URL opener, read content.
    req = request.Request(
        "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    with request.urlopen(req) as opener:
        # Convert bytes to UTF-8.
        content = opener.read().decode()
    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].find_all("a", {"class": "external text"})
    # `.string` yields NavigableString objects that keep a reference to the
    # whole parse tree, so pickling them recurses through the document and
    # hits the recursion limit. get_text() returns ordinary str instead.
    # NOTE(review): `in` on a Tag tests its direct children, so this drops
    # links whose text is exactly "reports" — confirm that is the intent.
    return [ext.get_text() for ext in external_class if "reports" not in ext]
# Computed once when the module is loaded, so loading this module calls the
# scraping function above.
sp500_constituents = get_standard_and_poors_500_constituents()
# Ticker strings kept alongside the constituent list.
spdr_etf = "SPY"
sp500_index = "^GSPC"
def main():
    """Pickle the module-level constituent list into a file stamped with today's date."""
    import datetime as dt

    stamp = dt.date.today()
    target = f"sp500_constituents_{stamp}.pkl"
    with open(target, "wb") as handle:
        pickle.dump(sp500_constituents, handle)


if __name__ == "__main__":
    main()
I am running python3 on a Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine, other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically what the code does is try to read S&P companies from Wikipedia and write the list of tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script tries instead to read an existing list from file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys
# NOTE(review): the Python documentation warns that a too-high recursion limit
# can crash the interpreter; with a limit this large, deep recursion can
# overflow the C stack before RecursionError is raised. Confirm that this
# override is still needed.
sys.setrecursionlimit(100000)
def get_standard_and_poors_500_constituents():
    """Return the S&P 500 constituent link texts, refreshing a local cache.

    The list is scraped from Wikipedia and pickled next to this script. When
    the page cannot be downloaded, the previously pickled list is loaded
    instead.

    Returns:
        A list of plain ``str`` values.

    Raises:
        OSError: If the download fails and no cache file can be read.
    """
    fname = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), "sp500_constituents.pkl"
    )
    try:
        # URL request, URL opener, read content.
        req = request.Request(
            "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        )
        with request.urlopen(req) as opener:
            # Convert bytes to UTF-8.
            content = opener.read().decode()
    except URLError:
        # Offline fallback: the cache is a local file written by this function.
        with open(fname, "rb") as f:
            return pickle.load(f)

    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].find_all("a", {"class": "external text"})
    # get_text() returns ordinary str. The NavigableString objects from
    # `.string` reference the whole parse tree, so pickling them recurses
    # through the document.
    c = [ext.get_text() for ext in external_class if "reports" not in ext]
    with open(fname, "wb") as f:
        pickle.dump(c, f)
    # Plain return: the former `finally: return c` discarded any in-flight
    # exception and raised UnboundLocalError when `c` was never assigned.
    return c
# Computed once when the module is loaded, so loading this module calls the
# function above (network access and/or cache file access).
sp500_constituents = get_standard_and_poors_500_constituents()
# Ticker strings kept alongside the constituent list.
spdr_etf = "SPY"
sp500_index = "^GSPC"
def main():
    """Fetch (or load from cache) the constituent list and print it."""
    print(get_standard_and_poors_500_constituents())


if __name__ == "__main__":
    main()