Python3 Pickle hitting recursion limit - python-3.x

I have the following block of code that, when executed on my Ubuntu computer using Python 3, raises a RecursionError during pickling. I don't understand why, since the object to be pickled is not particularly complex and doesn't involve any custom classes. In fact, it is only a list of roughly 500 elements, each of which is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could raise the recursion limit with import sys and sys.setrecursionlimit(), but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle

def get_standard_and_poors_500_constituents():
    # URL request, URL opener, read content.
    req = request.Request(
        "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    opener = request.urlopen(req)
    # Convert bytes to UTF-8.
    content = opener.read().decode()
    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].findAll("a", {"class":"external text"})
    c = [ext.string for ext in external_class if not "reports" in ext]
    return c

sp500_constituents = get_standard_and_poors_500_constituents()

spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    import datetime as dt
    today = dt.datetime.today().date()
    fname = "sp500_constituents_" + str(today) + ".pkl"
    with open(fname, "wb") as f:
        pickle.dump(sp500_constituents, f)

if __name__ == "__main__":
    main()
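A side note that may explain the depth: the list comprehension collects bs4 NavigableString objects rather than plain Python strings, and a NavigableString keeps references back into the whole parse tree, which is the usual way an apparently flat 500-element list ends up recursing deeply under pickle. Below is a minimal sketch of two possible workarounds, assuming it runs after the code above (the output filenames here are made up for illustration):

import sys

# Option 1: pickle plain strings, so pickle never walks back into the parse tree.
plain_constituents = [str(ticker) for ticker in sp500_constituents]
with open("sp500_constituents_plain.pkl", "wb") as f:
    pickle.dump(plain_constituents, f)

# Option 2: raise the recursion limit, as mentioned in the question.
# This may work, but a very high limit risks overflowing the C stack
# (compare the segmentation-fault question further down).
sys.setrecursionlimit(10000)
with open("sp500_constituents_deep.pkl", "wb") as f:
    pickle.dump(sp500_constituents, f)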

Related

difficulty reading online xml content with python, xml.etree.ElementTree and urllib

I am reading XML online from an RSS feed using Python, xml.etree.ElementTree and urllib.
My code seems straightforward, but it is not giving me the results that I want: no matter what I do, it always returns what looks like all the data in the XML stream.
I am open to better suggestions on how to read specific strings into lists.
See my code below:
import xml.etree.ElementTree as ET
from urllib import request

title_list = []

def main():
    try:
        response = request.urlopen("https://www.abcdefghijkl.xml")
        rsp_code = response.code
        print(rsp_code)
        if rsp_code == 200:
            webdata = response.read()
            print("1")
            xml = webdata.decode('UTF-8')
            print("2")
            tree = ET.parse(xml)
            print("3")
            items = tree.findall('channel')
            print("4")
            for item in items:
                title = item.find('title').text
                title_list.append(title)
                print(f"title_list 0 is, {title_list}")
            print("5")
    except Exception as e:
        print(f'An error occurred {str(e)}')

main()
Thanks, everyone, I figured it out after an awesome Udemy video. I eventually used the bs4 (Beautiful Soup) library and requests. Here's the code below:
import bs4
import requests

title_list = []

def main():
    try:
        result = requests.get("https://abcdefghijk.xml")
        res_text = result.text
        soup = bs4.BeautifulSoup(res_text, features="xml")
        title_tag_list = soup.select('title')
        for titles in title_tag_list:
            title = titles.text
            title_list.append(title)
            print(f"title_list 0 is, {title_list}")
        print("5")
    except Exception as e:
        print(f'An error occurred {str(e)}')

main()
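For completeness, here is a rough sketch of how the original ElementTree approach could also be made to work, assuming a standard RSS 2.0 layout for the placeholder feed: ET.parse() expects a filename or file object, so a decoded string has to go through ET.fromstring(), and item titles usually sit under channel/item rather than directly under channel.

import xml.etree.ElementTree as ET
from urllib import request

title_list = []

def main():
    # Same placeholder URL as in the question.
    response = request.urlopen("https://www.abcdefghijkl.xml")
    if response.code == 200:
        xml_text = response.read().decode('UTF-8')
        root = ET.fromstring(xml_text)               # parse from a string, not a file name
        for item in root.findall('./channel/item'):  # RSS items live under <channel>
            title_list.append(item.find('title').text)
    print(title_list)

main()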

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button on this page. But when I open the download link that I get from my code, I get this message.
I noticed that if I manually click the button and open the link in a new page, the csrfKey part of the link is always the same, whereas when I run the code I get a different key every time. Here's my code:
from bs4 import BeautifulSoup
import requests
import re

def GetPage(link):
    source_new = requests.get(link).text
    soup_new = BeautifulSoup(source_new, 'lxml')
    container_new = soup_new.find_all(class_='ipsButton')
    for data_new in container_new:
        # print(data_new)
        headline = data_new  # Display text
        match = re.findall('download', str(data_new), re.IGNORECASE)
        if(match):
            print(f'{headline["href"]}\n')

if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'

    with requests.Session() as connection:
        r = connection.get("https://eci.gov.in/")

        confirmation_url = BeautifulSoup(
            connection.get(link).text, 'lxml'
        ).select_one(".ipsApp .ipsButton_fullWidth")["href"]

        fake_agree_to_continue = connection.get(
            confirmation_url.replace("?do=download", "?do=download&confirm=1")
        ).text

        download_links = [
            a["href"] for a in
            BeautifulSoup(
                fake_agree_to_continue, "lxml"
            ).select(".ipsApp .ipsButton_small")[1:]
        ]

        for download_link in download_links:
            response = connection.get(download_link)
            file_name = (
                response
                .headers["Content-Disposition"]
                .replace('"', "")
                .split(" - ")[-1]
            )
            print(f"Downloading: {file_name}")
            with open(file_name, "wb") as f:
                f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls.

Trying to scrape multiple pages sequentially using loop to change url

First off, sorry... I am sure that this is a common problem, but I did not find the solution anywhere even though I searched for a while.
I am trying to create a list by scraping data from classicdb. The two problems I have are:
The scraping as written in the try block does not work inside the for loop, although on its own it works. Currently it just returns 0 even though there should be values to return.
The output that I get from the try block generates new lists, but I want to just get the value and append it later.
I have tried the try block outside the for loop and there it worked.
I also saw some solutions where a while True was used, but that did not work for me.
from lxml.html import fromstring
import requests
import traceback
import time
from bs4 import BeautifulSoup as bs

Item_name = []
Sell_Copper = []
items = [47, 48]
url = 'https://classic.wowhead.com/item='
fails = []

for i in items:
    time.sleep(5)
    url1 = (url + str(i))
    session = requests.session()
    response = session.get(url1)
    soup = bs(response.content, 'lxml')
    name = soup.select_one('h1').text
    print(name)
    # get the buy prices
    try:
        copper = soup.select_one('li:contains("Sells for") .moneycopper').text
    except Exception as e:
        copper = str(0)
The expected result would be that I get one value in gold and a list in P_Gold. In this case:
copper='1'
Sell_copper=['1','1']
You don't need a sleep. It needs to be div:contains, and the search text needs changing:
import requests
from bs4 import BeautifulSoup as bs

Item_name = []
Sell_Copper = []
items = [47, 48]
url = 'https://classic.wowhead.com/item='
fails = []

with requests.Session() as s:
    for i in items:
        response = s.get(url + str(i))
        soup = bs(response.content, 'lxml')
        name = soup.select_one('h1').text
        print(name)
        try:
            copper = soup.select_one('div:contains("Sell Price") .moneycopper').text
        except Exception as e:
            copper = str(0)
        print(copper)
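Since the question also wants to collect the values rather than just print them, a small extension of the answer's loop could append into the lists that are already declared. This is only a sketch built on the snippet above:

import requests
from bs4 import BeautifulSoup as bs

Item_name = []
Sell_Copper = []
items = [47, 48]
url = 'https://classic.wowhead.com/item='

with requests.Session() as s:
    for i in items:
        soup = bs(s.get(url + str(i)).content, 'lxml')
        name = soup.select_one('h1').text
        try:
            # Same selector as in the answer above.
            copper = soup.select_one('div:contains("Sell Price") .moneycopper').text
        except Exception:
            copper = str(0)
        # Collect the values instead of only printing them.
        Item_name.append(name)
        Sell_Copper.append(copper)

print(Item_name)
print(Sell_Copper)  # e.g. ['1', '1'], matching the expected result in the question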

Why does getting the indentation wrong make this function misbehave?

I cannot understand why this happens.
First, I wrote:
import urllib.request
from bs4 import BeautifulSoup
import time
import os

def download_image(url, name):
    path = "./scrape_image/"
    imagename = str(name) + ".jpg"
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        urllib.request.urlretrieve(url, path + imagename)

url = "https://api.XXXkeyword=YYY&limit=1000"
response = urllib.request.urlopen(url)
rss = response.read().decode("utf-8")
soup = BeautifulSoup(rss, "xml")
name = 0
for s in soup.find_all("photo"):
    url = s.find_all("image_url")[0].string
    name += 1
    download_image(url, name)
By running this code, I only get 1 image from the API, although it should be able to get 1000 images. I fixed the indentation in the first code, so my code now looks like this:
import urllib.request
from bs4 import BeautifulSoup
import time
import os

def download_image(url, name):
    path = "./image/"
    imagename = str(name) + ".jpg"
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
    urllib.request.urlretrieve(url, path + imagename)
    time.sleep(1)

url = "https://api.XXXkeyword=YYY&limit=1000"
response = urllib.request.urlopen(url)
rss = response.read().decode("utf-8")
soup = BeautifulSoup(rss, "xml")
name = 0
for s in soup.find_all("photo"):
    url = s.find_all("image_url")[0].string
    name += 1
    download_image(url, name)
Now I can get 1000 images from the API, but I cannot understand why fixing the indentation made the difference. Please give me an explanation.
Because in the first example you're only getting the image if your condition passes:
if not os.path.exists(path):
And that condition will only pass once because you immediately create the path:
os.makedirs(path)
For every other iteration of the loop, the condition is false. So the code within the conditional block doesn't execute.
Basically, an if block only executes if the condition is true. When you move your code out of the if block, it always executes regardless of the condition.
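Here is a stripped-down illustration of the difference, with print() standing in for the actual urlretrieve() call and assuming ./image does not exist before the first call:

import os

def buggy(name):
    path = "./image/"
    if not os.path.exists(path):
        os.makedirs(path)
        print("downloading", name)   # inside the if: runs only on the very first call

def fixed(name):
    path = "./image/"
    if not os.path.exists(path):
        os.makedirs(path)            # the directory is still created only once
    print("downloading", name)       # outside the if: runs on every call

for n in range(3):
    buggy(n)    # prints once
for n in range(3):
    fixed(n)    # prints three times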

segmentation fault in python3

I am running Python 3 on an Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine; other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically, what the code does is try to read the S&P 500 companies from Wikipedia and write the list of tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script tries instead to read an existing list from the file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys

sys.setrecursionlimit(100000)

def get_standard_and_poors_500_constituents():
    fname = (
        os.path.abspath(os.path.dirname(__file__)) + "/sp500_constituents.pkl"
    )
    try:
        # URL request, URL opener, read content.
        req = request.Request(
            "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        )
        opener = request.urlopen(req)
        # Convert bytes to UTF-8.
        content = opener.read().decode()
        soup = BeautifulSoup(content, "lxml")
        # HTML table we actually need is the first.
        tables = soup.find_all("table")
        external_class = tables[0].findAll("a", {"class":"external text"})
        c = [ext.string for ext in external_class if not "reports" in ext]
        with open(fname, "wb") as f:
            pickle.dump(c, f)
    except URLError:
        with open(fname, "rb") as f:
            c = pickle.load(f)
    finally:
        return c

sp500_constituents = get_standard_and_poors_500_constituents()

spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    X = get_standard_and_poors_500_constituents()
    print(X)

if __name__ == "__main__":
    main()
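One detail that may be relevant here: sys.setrecursionlimit() only lifts Python's own guard and does not grow the interpreter's C stack, so a limit of 100000 can let the deep recursion inside pickle overflow the native stack and crash with a segmentation fault instead of a clean RecursionError, and how close it gets can vary between runs. One workaround, sketched below on the assumption that the function above is in scope, is to run the deep work in a thread that is given a larger stack; pickling plain str values, as in the sketch near the top, avoids the deep recursion altogether and is probably the simpler fix.

import sys
import threading

sys.setrecursionlimit(100000)
threading.stack_size(64 * 1024 * 1024)   # 64 MiB C stack for threads created after this call

# Run the existing function (which pickles internally) on the bigger stack.
worker = threading.Thread(target=get_standard_and_poors_500_constituents)
worker.start()
worker.join()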
