How to display proper output when using re.findall() in python? - python-3.x

I made a Python script to get the latest stock price from Yahoo Finance.
import urllib.request
import re
htmlfile = urllib.request.urlopen("http://finance.yahoo.com/q?s=GOOG");
htmltext = htmlfile.read();
price = re.findall(b'<span id="yfs_l84_goog">(.+?)</span>',htmltext);
print(price);
It works smoothly, but when I output the price it comes out like this: [b'1,217.04']
This might be a petty issue to ask, but I'm new to Python scripting, so please help me if you can.
I want to get rid of the 'b'. If I remove the 'b' from b'<span id="yfs_l84_goog">' then it shows this error.
File "C:\Python33\lib\re.py", line 201, in findall
return _compile(pattern, flags).findall(string)
TypeError: can't use a string pattern on a bytes-like object
I want the output to be just
1,217.04

b'' is the syntax for bytes literals in Python. It is how you define byte sequences in Python source code.
What you see in the output is the representation of the single bytes object inside the price list returned by re.findall(). You could decode it into a string and print it:
>>> for item in price:
...     print(item.decode())  # assume utf-8
...
1,217.04
You could also write the bytes directly to stdout, e.g., sys.stdout.buffer.write(price[0]).
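For example, a minimal sketch of that approach:
import sys

sys.stdout.buffer.write(price[0] + b"\n")  # raw bytes, printed without the b'' prefix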
You could use an HTML parser instead of a regex to parse the HTML:
#!/usr/bin/env python3
import cgi
from html.parser import HTMLParser
from urllib.request import urlopen

url = 'http://finance.yahoo.com/q?s=GOOG'

def is_price_tag(tag, attrs):
    return tag == 'span' and dict(attrs).get('id') == 'yfs_l84_goog'

class Parser(HTMLParser):
    """Extract tag's text content from html."""
    def __init__(self, html, starttag_callback):
        HTMLParser.__init__(self)
        self.contents = []
        self.intag = None
        self.starttag_callback = starttag_callback
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        self.intag = self.starttag_callback(tag, attrs)

    def handle_endtag(self, tag):
        self.intag = False

    def handle_data(self, data):
        if self.intag:
            self.contents.append(data)

# download and convert to Unicode
response = urlopen(url)
_, params = cgi.parse_header(response.headers.get('Content-Type', ''))
html = response.read().decode(params['charset'])

# parse html (extract text from the price tag)
content = Parser(html, is_price_tag).contents[0]
print(content)
Check whether Yahoo provides an API that doesn't require web scraping.

Okay, after searching for a while I found the solution. It works fine for me.
import urllib.request
import re
htmlfile = urllib.request.urlopen("http://finance.yahoo.com/q?s=GOOG");
htmltext = htmlfile.read();
pattern = re.compile('<span id="yfs_l84_goog">(.+?)</span>');
price = pattern.findall(str(htmltext));
print(price);
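An equivalent variation (just a sketch, in the spirit of the decode() suggestion above, assuming the page is UTF-8 encoded) decodes the downloaded bytes once, so both the pattern and the result are plain strings:
import urllib.request
import re

htmlfile = urllib.request.urlopen("http://finance.yahoo.com/q?s=GOOG")
htmltext = htmlfile.read().decode()  # assume utf-8
price = re.findall('<span id="yfs_l84_goog">(.+?)</span>', htmltext)
print(price[0] if price else "no price found")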

Related

python doesn't get page content

I've got the Python BeautifulSoup script below (adapted to Python 3 from that script).
It executes fine but nothing is returned in cmd.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

Newlines = re.compile(r'[\r\n]\s+')

def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup.findAll('script'):
        s.replaceWith('')
    # find body and extract text
    txt = soup.find('body').getText('\n')
    # remove multiple linebreaks and whitespace
    return Newlines.sub('\n', txt)

def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead'
    ]
    txt = [getPageText(url) for url in urls]

if __name__ == "__main__":
    main()
Here's my cmd output
Microsoft Windows [Version 10.0..]
(c) Microsoft Corporation. All rights reserved.
C:\Users\user\Desktop\urls>python urls.py
C:\Users\user\Desktop\urls>
Why doesn't it return the pages contents?
Nothing is returned in cmd because there is no print statement in the code.
If you want to print out all the text parsed from the given URLs, just use the print function in the main() function:
def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead'
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

How to use BeautifulSoup to retrieve the URL of a tarfile

The following webpage contains all the source code URLs for the LFS project:
https://linuxfromscratch.org/lfs/view/systemd/chapter03/packages.html
I've written some Python 3 code to retrieve all these URLs from that page:
#!/usr/bin/env python3
from requests import get
from bs4 import BeautifulSoup
import re
import sys, os

#url=sys.argv[1]
url = "https://linuxfromscratch.org/lfs/view/systemd/chapter03/packages.html"
exts = (".xz", ".bz2", ".gz", ".lzma", ".tgz", ".zip")

response = get(url)
soup = BeautifulSoup(response.content, 'html.parser')
for link in soup.find_all('a', href=True):
    if link.get('href'):
        for anhref in link.get('href').split():
            if os.path.splitext(anhref)[-1] in exts:
                print((link.get('href')))
What I would like to do is input a pattern, say:
pattern = 'iproute2'
and then print the line that contains the iproute2 tarfile
which happens to be:
https://www.kernel.org/pub/linux/utils/net/iproute2/iproute2-5.12.0.tar.xz
I've tried using match = re.search(pattern, text) and it finds the correct line but if I print match I get:
<re.Match object; span=(43, 51), match='iproute2'>
How do I get it to print the actual URL?
You can use the .string property of the match object (it returns the string that was passed to re.search()).
Code Example
txt="https://www.kernel.org/pub/linux/utils/net/iproute2/iproute2-5.12.0.tar.xz"
pattern = 'iproute2'
match = re.search(pattern, txt)
if match: # this condition is used to avoid NoneType error
print(match.string)
else:
print('No Match Found')
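Applied to the loop in the question (a sketch that assumes the same soup, exts, and os setup from the code above), you can filter each href against the pattern and print the matching URL directly:
import re

pattern = 'iproute2'
for link in soup.find_all('a', href=True):
    href = link.get('href')
    # keep only tarball-style links that also match the requested pattern
    if os.path.splitext(href)[-1] in exts and re.search(pattern, href):
        print(href)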

Python - nested lists q

Pasted my code below. I'm trying to isolate the "name" value of an entry inside the nested list the API replies with, but it's not detected as a nested list for some reason. My response, or what the decode variable prints, is:
[{"id":"a981fdae11a949b2bb3c78cc2c7f820d","name":"Juice"},{"id":"ec07182556b444d38e9af48bf74bde82","name":"Dog"},{"id":"2aca11f3fa364adf83183e10e8794c46","name":"csheim","legacy":true}]
From the above output I need to isolate / print the name "csheim", as its entry contains "legacy": true, but as I said above, I get this error when I even try to access sub/nested lists inside my list, so I don't know how to go about this:
print(decode[1][1])
IndexError: string index out of range
import requests
import time
import socket
import json
import string
host_ip = socket.gethostname()
print("")
print(socket.gethostbyname(host_ip))
print("")
time.sleep(2)
payload = ["juice", "csheim", "dog"]
r = requests.post('https://api.example.com', json=payload)
decode = r.text
print(decode)
print(decode[1][1])
You should parse it with the json library, then treat it like a list of dictionaries.
Try this:
import requests
import time
import socket
import json
import string
host_ip = socket.gethostname()
print("")
print(socket.gethostbyname(host_ip))
print("")
time.sleep(2)
payload = ["juice", "csheim", "dog"]
r = requests.post('https://api.example.com', json=payload)
decode = json.loads(r.text)
for item in decode:
    if "legacy" in item:
        print(item["name"])

How to extract a table element from html source of a web page using selenium phantomJS in python 3?

I am doing a web crawler project which is supposed to take two dates as input (like 2019-03-01 and 2019-03-05) and then attach every day between these two dates to the end of a base link (for example, base link + date is https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3). I want to extract the table with the "tablesaw-sortable" class name from the page source and save it in a text file or any similar file format.
I developed this code:
from datetime import timedelta, date
from bs4 import BeautifulSoup
import urllib.request
from selenium import webdriver

class webcrawler():
    def __init__(self, st_date, end_date):
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        return [str(date1 + timedelta(n)) for n in range(int((self.end_date - self.st_date).days) + 1)]

    def create_link(self, attachment):
        url = str(self.base_url)
        url += attachment
        return url

    def open_link(self, link):
        driver = webdriver.PhantomJS()
        driver.get(link)
        html = driver.page_source
        return html

    def extract_table(self, html):
        soup = BeautifulSoup(html)
        print(soup.prettify())

    def output_to_csv(self):
        pass

date1 = date(2018, 3, 1)
date2 = date(2019, 3, 5)

test = webcrawler(st_date=date1, end_date=date2)
date_list = test.date_list()
link = test.create_link(date_list[0])
html = test.open_link(link)
test.extract_table(html)
The problem is that it takes very long to get the page source of just one link. I already used urllib.request, but the problem with that method is that it sometimes gets the HTML content without waiting for the table to be fully loaded.
How can I speed up the process, extract just the mentioned table from the HTML source, and not wait for the rest? I just want the information in the table rows to be saved in some text file for each date.
Can anybody help me deal with this problem?
There are quite a few notable things wrong with this code and how you are using the libraries. Let me try to fix it up.
First, I don't see you using the urllib.request library. You can remove this, or if you are using it in another spot in your code, I recommend the widely used requests module. I also recommend using the requests library instead of selenium if you are only trying to get the HTML source from a site, as selenium is designed more for navigating sites and acting as a 'real' person.
You can use response = requests.get('https://your.url.here') and then response.text to get the returned HTML.
Next I noticed in the open_link() method, you are creating a new instance of the PhantomJS class each time you call the method. This is really inefficient as selenium uses a lot of resources (and takes a long time, depending on the driver you are using). This may be a big contributor to your code running slower than desired. You should reuse the driver instance as much as possible, as selenium was designed to be used that way. A great solution to this would be creating the driver instance in the webcrawler.__init__() method.
class WebCrawler():
    def __init__(self, st_date, end_date):
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def open_link(self, link):
        self.driver.get(link)
        html = self.driver.page_source
        return html

# Alternatively, using the requests library
class WebCrawler():
    def __init__(self, st_date, end_date):
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def open_link(self, link):
        response = requests.get(link)
        html = response.text
        return html
Side note: For class names, you should use CamelCase instead of lowercase. This is just a suggestion, but the original creator of Python created PEP 8 to define a general style guide for writing Python code. Check it out here: Class Naming
Another odd thing I found was that you are casting a string to... a string. You do this at url = str(self.base_url). This doesn't hurt anything, but it also doesn't help. I can't find any resources/links, but I have a suspicion that this takes extra time for the interpreter. Since speed is a concern, I recommend just using url = self.base_url, since the base url is already a string.
I see that you are formatting and creating urls by hand, but if you want a bit more control and less bugs, check out the furl library.
def create_link(self, attachment):
    f = furl(self.base_url)
    # The '/=' operator means append to the end, docs: https://github.com/gruns/furl/blob/master/API.md#path
    f.path /= attachment
    # Cleanup and remove invalid characters in the url
    f.path.normalize()
    return f.url  # returns the url as a string
Another potential issue is that the extract_table() method does not extract anything; it simply formats the HTML in a way that is human readable. I won't go into depth on this, but I recommend learning CSS selectors or XPath selectors for easily pulling data from HTML; a rough sketch follows below.
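For example, a minimal sketch of that idea (this is an assumption layered on top of the answer, using the "tablesaw-sortable" class name from the question) could return the table rows as tab-separated text instead of printing the prettified page:
def extract_table(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # CSS selector for the table class mentioned in the question (assumed to be present on the page)
    table = soup.select_one('table.tablesaw-sortable')
    if table is None:
        return []
    rows = []
    for tr in table.select('tr'):
        cells = [cell.get_text(strip=True) for cell in tr.select('th, td')]
        if cells:
            rows.append('\t'.join(cells))
    return rows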
In the date_list() method, you are trying to use the date1 variable, but have not defined it anywhere. I would break up the list comprehension there and expand it over a few lines, so you can easily read and understand what it is trying to do.
Below is the full, refactored, suggested code.
from datetime import timedelta, date
from bs4 import BeautifulSoup
import requests
from furl import furl

class WebCrawler():
    def __init__(self, st_date, end_date):
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime("%Y-%m-%d"))
        return dates

    def create_link(self, attachment):
        f = furl(self.base_url)
        # The '/=' operator means append to the end, docs: https://github.com/gruns/furl/blob/master/API.md#path
        f.path /= attachment
        # Cleanup and remove invalid characters in the url
        f.path.normalize()
        return f.url  # returns the url as a string

    def open_link(self, link):
        response = requests.get(link)
        html = response.text
        return html

    def extract_table(self, html):
        soup = BeautifulSoup(html)
        print(soup.prettify())

    def output_to_csv(self):
        pass

date1 = date(2018, 3, 1)
date2 = date(2019, 3, 5)

test = WebCrawler(st_date=date1, end_date=date2)
date_list = test.date_list()
link = test.create_link(date_list[0])
html = test.open_link(link)
test.extract_table(html)

Python3 Pickle hitting recursion limit

I have the following block of code that when executed on my Ubuntu computer using Python3 hits the recursion error for pickling. I don't understand why since the object to be pickled is not particularly complex and doesn't involve any custom objects. In fact, it is only a list of some 500 elements (approximately); each element of the list is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could up the recursion limit with import sys and sys.setrecursionlimit() but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle

def get_standard_and_poors_500_constituents():
    # URL request, URL opener, read content.
    req = request.Request(
        "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    opener = request.urlopen(req)
    # Convert bytes to UTF-8.
    content = opener.read().decode()
    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].findAll("a", {"class": "external text"})
    c = [ext.string for ext in external_class if not "reports" in ext]
    return c

sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    import datetime as dt
    today = dt.datetime.today().date()
    fname = "sp500_constituents_" + str(today) + ".pkl"
    with open(fname, "wb") as f:
        pickle.dump(sp500_constituents, f)

if __name__ == "__main__":
    main()
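A hedged guess at the cause, based on how BeautifulSoup represents text rather than on anything confirmed in this thread: ext.string returns NavigableString objects, which keep references back into the parse tree, so pickling the list drags the whole soup along and can hit the recursion limit. Converting each entry to a plain str before pickling would sidestep that:
# sketch: inside get_standard_and_poors_500_constituents(), build plain strings instead
c = [str(ext.string) for ext in external_class if not "reports" in ext]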
