In my code, I am trying to use multiprocessing to find the max price of each coin given a URL. There are around 1400 coins that I have to get data for, so I implemented Python's multiprocessing Pool. I'm not sure whether I am using it correctly, but I followed the example given in the documentation: https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
Here is my code:
import requests
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool

max_prices = []

def find_max(url):
    # finds maximum price of a coin
    r = requests.get(url)
    cont = r.json()
    prices = list(map(lambda x: x[1], cont["price_usd"]))
    maxPrice = max(prices)
    return maxPrice

with open("coins.txt", "r") as f:
    data = json.load(f)

coin_slug = [d["slug"] for d in data]
coin_names = [d["name"] for d in data]

urls = []
for item in coin_slug:
    url = "https://graphs2.coinmarketcap.com/currencies/" + item + "/"
    urls.append(url)

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(find_max, urls)
When I added this part of the code, it gave me an EOF error:
if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(find_max, urls)
You have unbalanced brackets in the last line. It should be
print(p.map(find_max, urls)).
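With that closing parenthesis added, the guarded block reads:

if __name__ == '__main__':
    # map find_max over all coin URLs using 5 worker processes
    with Pool(5) as p:
        print(p.map(find_max, urls))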
Related
I have the following:
from multiprocessing import Pool

def process_elements(index_of_data_inputs):
    <process>
    if <condition>:
        # I would like to change the size of data_inputs here

if __name__ == '__main__':
    pool = Pool()  # create a multiprocessing Pool
    pool.map(process_elements, range(0, len(data_inputs)))  # process the data_inputs iterable with the pool
How can I change the size of data_inputs, and so change the number of times process_elements is called?
The work I would like to parallelize is:
i = 0
while i < len(elements):
    new_elems = process_some_elements(x, y)
    if len(new_elems) > 0:
        elements = elements + new_elems
    i += 1
Consider a simple example of communication between processes with the multiprocessing module in Python:
import multiprocessing
import queue
import random

def process_elements(num, comq):
    val = random.random()
    if val > 0.5:
        comq.put(1)
    return num, int(1000 * val)

if __name__ == '__main__':
    # initial data
    numbers = list(range(10))
    # data structure for communication between multiple processes
    m = multiprocessing.Manager()
    q = m.Queue()
    with multiprocessing.Pool(processes=4) as pool:
        # get answer for original data
        ans = pool.starmap(process_elements, [(num, q) for num in numbers])
        print(numbers)
        print(ans)
        # create additional data based on the answer for initial data
        new_numbers = numbers[-1:]
        try:
            while True:
                new_numbers.append(new_numbers[-1] + q.get_nowait())
        except queue.Empty:
            pass
        # get answer for additional data
        new_ans = pool.starmap(process_elements, [(num, q) for num in new_numbers[1:]])
        print(new_numbers)
        print(new_ans)
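If the amount of work has to keep growing until no worker asks for more (as in the while loop in the question), the same Manager-queue pattern can be run in rounds until the queue stays empty. A minimal sketch along those lines; the names pending and results are mine, and the new work items are just random numbers standing in for whatever process_some_elements would actually produce:

import multiprocessing
import queue
import random

def process_elements(num, comq):
    # same worker as above: occasionally requests more work via the queue
    val = random.random()
    if val > 0.5:
        comq.put(1)
    return num, int(1000 * val)

if __name__ == '__main__':
    m = multiprocessing.Manager()
    q = m.Queue()
    pending = list(range(10))   # initial work items
    results = []
    with multiprocessing.Pool(processes=4) as pool:
        while pending:
            results += pool.starmap(process_elements, [(num, q) for num in pending])
            # turn every request left in the queue into a new work item
            pending = []
            try:
                while True:
                    q.get_nowait()
                    pending.append(random.randrange(1000))
            except queue.Empty:
                pass
    print(len(results), "items processed in total")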
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool

def get_total_pages():
    tut = []
    base_url = 'Your group '
    for url in [base_url % i for i in range(1, 27)]:
        tut.append(url)
    print(tut)
    #get_data_from_page(tut)
    pool = ThreadPool(8)
    results = pool.map(get_data_from_page, tut)

def get_data_from_page(tut):
    f = open("emails.txt", 'a')
    email = []
    for a in tut:
        link = s.get(a).text
        soup = BeautifulSoup(link, 'lxml')
        links = soup.find('div', class_="mens").find_all('span', class_="inviz")
        for e in links:
            emails = e.text
            f.write(emails + ', ')
            email.append(emails)
    print(email)

def main():
    get_total_pages()

if __name__ == '__main__':
    main()
This results in an error saying it only works with multiprocessing, followed by:
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The problem was in this:

for a in tut:
    link = s.get(a).text

pool.map already passes the URLs from tut to get_data_from_page one at a time, so inside the function tut is a single URL string; looping over it iterates character by character, which is where the invalid URL 'h' comes from. What was needed was just:

link = s.get(tut).text  # without the for loop
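A corrected get_data_from_page along those lines might look like the following sketch (the module-level s = Session() is an assumption, since the snippet in the question never creates it):

from requests import Session
from bs4 import BeautifulSoup

s = Session()  # shared session, assumed to exist at module level

def get_data_from_page(url):
    # pool.map calls this once per URL, so there is no loop over tut here
    emails = []
    page = s.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    spans = soup.find('div', class_="mens").find_all('span', class_="inviz")
    with open("emails.txt", 'a') as f:
        for e in spans:
            f.write(e.text + ', ')
            emails.append(e.text)
    print(emails)
    return emails

Returning the list also lets results = pool.map(get_data_from_page, tut) collect everything in the caller, which avoids eight threads appending to emails.txt at the same time if you later decide to do the writing in one place.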
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool

#s = Session()

def get_photo_from_page():
    tut = []
    r = requests.get('https://vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    im = soup.find_all('img', class_="ph_img")
    for a in im:
        s = a.get('data-src_big').split('|')[0]
        tut.append(s)
    y = "img%s.jpg"
    for t, im in tut, [y % i for i in range(1, 5)]:
        p = requests.get(t)
        out = open(im, "wb")
        out.write(p.content)
        out.close()

def main():
    get_photo_from_page()

if __name__ == '__main__':
    main()
The error from cmd points at the line for t, im in tut, [y % i for i in range(1,5)]::

ValueError: too many values to unpack (expected 2)
> I need to pair the URLs one to one with file names and, while going through each link, save every image under a new name; with separate loops it always takes the last available link and saves it as many times as the loop indicates.
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool

#s = Session()

def get_photo_from_page():
    tut = []
    r = requests.get('https://m.vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    im = soup.find_all('img', class_="ph_img")
    try:
        for a in im:
            s = a.get('data-src_big').split('|')[0]
            tut.append(s)
        print(tut)
    except:
        print('no links found')
    # enumerate pairs each link with a running number, so every image
    # gets its own file name instead of unpacking two separate lists
    for num, link in enumerate(tut, start=1):
        p = requests.get(link)
        out = open("img%s.jpg" % (num), 'wb')
        out.write(p.content)
        out.close()

def main():
    get_photo_from_page()

if __name__ == '__main__':
    main()
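For completeness: the unpacking failed because tut, [y % i for i in range(1, 5)] builds a single two-element tuple of lists, so the loop tries to unpack whole lists into t and im. If you do want a pre-built list of file names instead of enumerate, zip is what pairs the two lists element by element. A sketch, assuming tut already holds the image URLs:

y = "img%s.jpg"
names = [y % i for i in range(1, len(tut) + 1)]
# zip pairs the i-th URL with the i-th file name
for t, im in zip(tut, names):
    p = requests.get(t)
    with open(im, "wb") as out:
        out.write(p.content)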
I have the following block of code that, when executed on my Ubuntu computer using Python 3, hits a recursion limit error during pickling. I don't understand why, since the object to be pickled is not particularly complex and doesn't involve any custom objects. In fact, it is only a list of some 500 elements (approximately); each element of the list is just a string. It seems to me that I should be able to serialize this object without issue. Why am I hitting a recursion limit error? I know I could raise the recursion limit with import sys and sys.setrecursionlimit(), but I am frankly surprised I have to do that for such a trivial object.
from urllib import request
from bs4 import BeautifulSoup
import pickle

def get_standard_and_poors_500_constituents():
    # URL request, URL opener, read content.
    req = request.Request(
        "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    )
    opener = request.urlopen(req)
    # Convert bytes to UTF-8.
    content = opener.read().decode()
    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].findAll("a", {"class": "external text"})
    c = [ext.string for ext in external_class if not "reports" in ext]
    return c

sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    import datetime as dt
    today = dt.datetime.today().date()
    fname = "sp500_constituents_" + str(today) + ".pkl"
    with open(fname, "wb") as f:
        pickle.dump(sp500_constituents, f)

if __name__ == "__main__":
    main()
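One likely culprit, though it is an assumption rather than something the question itself confirms: ext.string is a bs4 NavigableString, not a plain str, and it carries references back into the parse tree, so pickle ends up walking far more than a 500-element list of strings. A small self-contained check:

from bs4 import BeautifulSoup
import pickle

soup = BeautifulSoup("<a class='external text'>MMM</a>", "lxml")
tag = soup.find("a")

print(type(tag.string))        # bs4.element.NavigableString, a str subclass
plain = str(tag.string)        # plain str keeps no reference to the soup
pickle.dumps([plain])          # pickles fine at the default recursion limit

If that is what is happening here, changing the list comprehension to c = [str(ext.string) for ext in external_class if not "reports" in ext] should let the pickle.dump go through without touching the recursion limit.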
I am running Python 3 on an Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine, other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically, what the code does is try to read the list of S&P 500 companies from Wikipedia and write the list of tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script instead tries to read an existing list from file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys

sys.setrecursionlimit(100000)

def get_standard_and_poors_500_constituents():
    fname = (
        os.path.abspath(os.path.dirname(__file__)) + "/sp500_constituents.pkl"
    )
    try:
        # URL request, URL opener, read content.
        req = request.Request(
            "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        )
        opener = request.urlopen(req)
        # Convert bytes to UTF-8.
        content = opener.read().decode()
        soup = BeautifulSoup(content, "lxml")
        # HTML table we actually need is the first.
        tables = soup.find_all("table")
        external_class = tables[0].findAll("a", {"class": "external text"})
        c = [ext.string for ext in external_class if not "reports" in ext]
        with open(fname, "wb") as f:
            pickle.dump(c, f)
    except URLError:
        with open(fname, "rb") as f:
            c = pickle.load(f)
    finally:
        return c

sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    X = get_standard_and_poors_500_constituents()
    print(X)

if __name__ == "__main__":
    main()
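A possible explanation, offered as an assumption rather than a diagnosis: sys.setrecursionlimit(100000) only lifts Python's own safety check, and the standard library documentation warns that setting it too high can crash the interpreter because the C stack still overflows. That would also fit the intermittence: when the Wikipedia request fails, the except branch just loads the small pickle and nothing recurses deeply; when it succeeds, the deep pickling path runs. A quick way to see what is actually in the list being pickled:

# sketch: check what kind of objects end up in the pickled list
constituents = get_standard_and_poors_500_constituents()
# plain str pickles shallowly; a bs4 NavigableString drags the parse tree along
print(type(constituents[0]))

If that prints bs4.element.NavigableString, the same str(ext.string) conversion as in the previous sketch removes the deep recursion, and the setrecursionlimit call can be dropped.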