segmentation fault in python3 - python-3.x

I am running python3 on a Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine, other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically what the code does is try to read S&P companies from Wikipedia and write the list of tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script tries instead to read an existing list from file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys
sys.setrecursionlimit(100000)
def get_standard_and_poors_500_constituents():
fname = (
os.path.abspath(os.path.dirname(__file__)) + "/sp500_constituents.pkl"
)
try:
# URL request, URL opener, read content.
req = request.Request(
"http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
)
opener = request.urlopen(req)
# Convert bytes to UTF-8.
content = opener.read().decode()
soup = BeautifulSoup(content, "lxml")
# HTML table we actually need is the first.
tables = soup.find_all("table")
external_class = tables[0].findAll("a", {"class":"external text"})
c = [ext.string for ext in external_class if not "reports" in ext]
with open(fname, "wb") as f:
pickle.dump(c, f)
except URLError:
with open(fname, "rb") as f:
c = pickle.load(f)
finally:
return c
sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"
def main():
X = get_standard_and_poors_500_constituents()
print(X)
if __name__ == "__main__":
main()

Related

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from where I am trying to extract data. I have found a way to extract data from that url. but how do i then move to the next URL if the existing URL doesn't have any data?
The base URL have used in main function is,
https://posoco.in/reports/daily-reports/
instead I only want to extract data from 2020-21 and then go on from here,
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
this way if the pdf files from 2020-21 is all extracted, then the program should start extracting from the next URL which is 2021-22 so on as long as the website exists in an automated way so program can check them automated every-year.
The code I have written.
#import libraries
import re
import tabula
import datetime
from datetime import datetime,timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget
def source_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('daily-reports-' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
def get_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('_nldc_psp/?wpdmdl' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
if __name__ == "__main__":
url = 'https://posoco.in/reports/daily-reports/'
file_links = source_urls(url)
sorted_file_links = sorted(file_links)
for files in sorted_file_links:
sub_files = get_urls(files)
for x in sub_files:
print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here. rest are used to download and for processing and logging.
Well, you've to set an condition where it's will pickup the main urls which is equal/bigger than 2021-20
And then you can parse the inner urls.
Also, There's no need to use .get here!
Since you are using a condition to pickup urls include _nldc_psp/?wpdmdlwhich means if there's a pattern then return it. so you DONNOT need to replace the value with empty values link.get('href','') and then return it!
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp
cond = datetime.strptime('2020-21', '%Y-%d')
async def get_urls(client):
r = await client.get('https://posoco.in/reports/daily-reports/')
soup = BeautifulSoup(r.text, 'lxml')
return [x['href'] for x in soup.select('a[href*=reports-]') if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]
async def main():
async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
links = await get_urls(client)
async def get_pdf(url):
r = await client.get(url)
soup = BeautifulSoup(r.text, 'lxml')
pp([x['href']
for x in soup.select('a[href*="nldc_psp"]')])
if links:
for link in links:
nurse.start_soon(get_pdf, link)
if __name__ == "__main__":
trio.run(main)

how to download and iterate over csv file

I'm trying to download and iterate over csv file but I'm only reading the headers but no more lines after it
tried using this answer but with no luck
this is my code:
from datetime import datetime
import requests
import csv
def main():
print("python main function")
datetime_object = datetime.now().date()
url = f'https://markets.cboe.com/us/equities/market_statistics/volume_reports/day/{datetime_object}/csv/?mkt=bzx'
print(url)
response = requests.get(url, stream=True)
csv_content = response.content.decode('utf-8')
print(csv_content)
cr = csv.reader(csv_content.splitlines(), delimiter='~')
my_list = list(cr)
for row in my_list:
print(row)
if __name__ == '__main__':
main()
cr = csv.reader(csv_content.splitlines(), delimiter='~')
change to
cr = csv.reader(csv_content.splitlines(), delimiter=',')
And check if You download full file or file with header only use URL in browser ;)

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning python3 and I try to solve a simple task. I want to get the name of account and the date of post from instagram link.
import requests
from bs4 import BeautifulSoup
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code works sometimes with links like this https://www.instagram.com/kingtop
But I need it to work also with post of image like this https://www.instagram.com/p/BuxB00KFI-x/
That's all what I could make, but this is not working. And I can't get the date also.
Do you have any ideas? I appreciate any help.
I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError
start = time.time()
file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
for url in urls:
url = url.strip ('\n')
try:
req = requests.get(url)
req.raise_for_status()
except HTTPError as http_err:
output = open('output2.txt', 'a')
output.write(f'не найдена\n')
except Exception as err:
output = open('output2.txt', 'a')
output.write(f'не найдены\n')
else:
output = open('output2.txt', 'a')
soup = BeautifulSoup(req.text, "lxml")
the_url = soup.select("[rel='canonical']")[0]['href']
the_url2=the_url.replace('https://www.instagram.com/','')
head, sep, tail = the_url2.partition('/')
output.write (head+'\n')

Error with cycle For and requests Python3 Parsing

import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
tut = []
r = requests.get('https://vk.com/uporols_you').text
soup = BeautifulSoup(r, 'lxml')
im = soup.find_all('img', class_="ph_img")
for a in im:
s = a.get('data-src_big').split('|')[0]
tut.append(s)
y = "img%s.jpg"
for t, im in tut, [y % i for i in range(1,5)]:
p = requests.get(t)
out = open(im, "wb")
out.write(p.content)
out.close()
def main():
get_photo_from_page()
if __name__ == '__main__':
main()
error from cmd for t, im in tut, [y % i for i in range(1,5)]:
ValueError: too many values to unpack (expected 2)
> I need to list with a 1 to 1 accrue to URL, and on passage possylke,
and save all images with the new name, in separate cycles, it always
takes the last available reference and stores it as the number of
times indicated in the cycle.
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
tut = []
r = requests.get('https://m.vk.com/uporols_you').text
soup = BeautifulSoup(r, 'lxml')
im = soup.find_all('img', class_="ph_img")
try:
for a in im:
s = a.get('data-src_big').split('|')[0]
tut.append(s)
print(tut)
except:
print('no have any links)')
for num, link in enumerate(tut, start=1):
p = requests.get(link)
out = open("img%s.jpg" % (num), 'wb')
out.write(p.content)
out.close()
def main():
get_photo_from_page()
if __name__ == '__main__':
main()

Python3 Pickle hitting recursion limit

I have the following block of code that when executed on my Ubuntu computer using Python3 hits the recursion error for pickling. I don't understand why since the object to be pickled is not particularly complex and doesn't involve any custom objects. In fact, it is only a list of some 500 elements (approximately); each element of the list is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could up the recursion limit with import sys and sys.setrecursionlimit() but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle
def get_standard_and_poors_500_constituents():
# URL request, URL opener, read content.
req = request.Request(
"http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
)
opener = request.urlopen(req)
# Convert bytes to UTF-8.
content = opener.read().decode()
soup = BeautifulSoup(content, "lxml")
# HTML table we actually need is the first.
tables = soup.find_all("table")
external_class = tables[0].findAll("a", {"class":"external text"})
c = [ext.string for ext in external_class if not "reports" in ext]
return c
sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"
def main():
import datetime as dt
today = dt.datetime.today().date()
fname = "sp500_constituents_" + str(today) + ".pkl"
with open(fname, "wb") as f:
pickle.dump(sp500_constituents, f)
if __name__ == "__main__":
main()

Resources