import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
    tut = []
    r = requests.get('https://vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    im = soup.find_all('img', class_="ph_img")
    for a in im:
        s = a.get('data-src_big').split('|')[0]
        tut.append(s)
    y = "img%s.jpg"
    for t, im in tut, [y % i for i in range(1, 5)]:
        p = requests.get(t)
        out = open(im, "wb")
        out.write(p.content)
        out.close()

def main():
    get_photo_from_page()

if __name__ == '__main__':
    main()
Error from cmd:
for t, im in tut, [y % i for i in range(1, 5)]:
ValueError: too many values to unpack (expected 2)
> I need to pair the list of file names one-to-one with the URLs, follow each link, and save every image under its own new name. With separate loops it always takes the last available link and saves it as many times as the loop runs.
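The ValueError happens because the right-hand side of the for statement, tut, [y % i for i in range(1, 5)], is a two-element tuple of lists, so each iteration tries to unpack an entire list into t and im. Pairing the two lists with zip gives the one-to-one mapping the question asks for; a minimal sketch, assuming tut already holds the image URLs:

names = ["img%s.jpg" % i for i in range(1, len(tut) + 1)]
for link, name in zip(tut, names):  # one file name per URL
    p = requests.get(link)
    with open(name, "wb") as out:
        out.write(p.content)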
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
    tut = []
    r = requests.get('https://m.vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    im = soup.find_all('img', class_="ph_img")
    try:
        for a in im:
            s = a.get('data-src_big').split('|')[0]
            tut.append(s)
        print(tut)
    except:
        print('no links found')
    for num, link in enumerate(tut, start=1):
        p = requests.get(link)
        out = open("img%s.jpg" % num, 'wb')
        out.write(p.content)
        out.close()

def main():
    get_photo_from_page()

if __name__ == '__main__':
    main()
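If the photos are large, a streaming download keeps each file from being held fully in memory. A minimal sketch under the same assumptions (tut holds the image URLs; the chunk size is arbitrary):

for num, link in enumerate(tut, start=1):
    with requests.get(link, stream=True) as p:
        with open("img%s.jpg" % num, "wb") as out:
            for chunk in p.iter_content(chunk_size=8192):
                out.write(chunk)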
In my code, I am trying to use multiprocessing to find the max price of each coin given a URL. There are around 1400 coins that I have to get data for, so I implemented Python's multiprocessing Pool. I'm not sure if I am using it correctly, but I followed the example given at https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
Here is my code:
import requests
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
max_prices = []
def find_max(url):
    # finds maximum price of a coin
    r = requests.get(url)
    cont = r.json()
    prices = list(map(lambda x: x[1], cont["price_usd"]))
    maxPrice = max(prices)
    return maxPrice

with open("coins.txt", "r") as f:
    data = json.load(f)
coin_slug = [d["slug"] for d in data]
coin_names = [d["name"] for d in data]

urls = []
for item in coin_slug:
    url = "https://graphs2.coinmarketcap.com/currencies/" + item + "/"
    urls.append(url)

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(find_max, urls)
When I added this part of the code, it gave me an EOF error:
if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(find_max, urls)
You have unbalanced brackets in the last line. It should be
print(p.map(find_max, urls)).
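For reference, the same block with the parentheses balanced (nothing else changes):

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(find_max, urls))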
import time
import datetime as dt
import urllib.request
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib.animation as Animation
from matplotlib import style
import matplotlib
import csv
import threading
style.use('fivethirtyeight')
fig = plt.figure()

def usd_in_bitcoin():
    try:
        resp = urllib.request.urlopen("https://bitcoinwisdom.com/")
    except Exception as e:
        print(e)
    text = resp.read()
    soup = BeautifulSoup(text, 'html.parser')
    intermediate = soup.find('tr', {"id": "o_btcusd"})
    ans = intermediate.find('td', {'class': 'r'})
    return ans.contents[0]

def write_to_file(interval):
    while True:
        value = str(usd_in_bitcoin())
        unix_time = str(time.time())
        print(unix_time, value)
        with open('bitcoin_usd.csv', 'a+') as file:
            file.write(unix_time)
            file.write("," + str(value))
            file.write('\n')
        time.sleep(interval)

def animate(i):
    with open('bitcoin_usd.csv') as csv_file:
        readcsv = csv.reader(csv_file, delimiter=',')
        xs = []
        ys = []
        for row in readcsv:
            if len(row) > 1:
                x, y = [float(s) for s in row]
                xs.append(dt.datetime.fromtimestamp(x))
                ys.append(y)
    print(len(xs))
    dates = matplotlib.dates.date2num(xs)
    # print(dates)
    fig.clear()
    plt.plot_date(dates, ys)

def plotting():
    ani = Animation.FuncAnimation(fig, animate, interval=1000)
    plt.show()

def main():
    # plotting()
    b = threading.Thread(name='making graph', target=plotting)
    # a = threading.Thread(name='updating_csv', target=write_to_file, args=(5,))
    # a.start()
    b.start()

if __name__ == '__main__':
    main()
In the above block of code, I am trying to plot the value of a bitcoin in USD by scraping it and appending the value to a CSV file.
I then read the CSV file to plot the graph.
Plotting and scraping each work fine on their own, but when I run both simultaneously I get an error saying "main thread is not in main loop". I have searched a lot but was not able to solve this problem.
The problem here is with how main() divides the work between threads: matplotlib's GUI event loop must run in the main thread, which is exactly what the "main thread is not in main loop" message is complaining about. Run the CSV writer in a background thread and call plotting() directly.
Try this:
def main():
    a = threading.Thread(name='updating_csv', target=write_to_file, args=(5,))
    a.start()
    plotting()  # matplotlib stays in the main thread
How do I get the URL itself out of a style attribute? For example: style="width: 433px; height: 510px; background-image: url(https://cs7056.vk.me/c635104/v635104607/1c316/ADzy-2WY8pw.jpg)". Using Selenium 3 and Python 3.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import re
import time
url = 'https://vk.com/uporols_you'
driver = webdriver.Firefox(executable_path='C:/Users/PANDEMIC/AppData/Local/Mozilla/geckodriver.exe')

def login(driver):
    log_page = driver.get('https://login.vk.com/?act=login')
    find_login_input = driver.find_element_by_id('login_form').find_element_by_id('email').send_keys('+77782303865')
    find_password_input = driver.find_element_by_id('login_form').find_element_by_id('pass').send_keys('pass')
    find_button = driver.find_element_by_xpath('//button[@id="login_button"]').click()
    time.sleep(5)

def get_photo_from_page(driver):
    driver.get(url)
    try:
        driver.find_element_by_class_name('popup_box_container').find_element_by_class_name('box_title_wrap').find_element_by_class_name('box_x_button').click()
    except:
        print('nope nothing')
    for i in range(2):
        scrol_down = driver.find_element_by_id('public_wall').find_element_by_id('wall_more_link').click()
        time.sleep(2)
    tut = []
    #t = (a[@class="page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row"])
    for ii in driver.find_elements_by_xpath('//a[@style]'):
        o = ii.get_attribute('style')
        print(o)
        #soup = BeautifulSoup(htlm, 'lxml')
        #im = soup.find_all('a', class_="'page_post_thumb_wrap image_cover page_post_thumb_last_column page_post_thumb_last_row'")
        #print(htlm)
        #for a in im:
        #    s = a.get('data-src_big').split('|')[0]
        #    tut.append(s)
        #print(tut)
        #for num, link in enumerate(tut, start=1):
        #    p = requests.get(link)
        #    out = open("img%s.jpg" % (num), 'wb')
        #    out.write(p.content)
        #    out.close()

def main():
    login(driver)
    get_photo_from_page(driver)

if __name__ == '__main__':
    main()
In that particular case, you could just parse the style string that you were already able to gather with your script.
Just add this function to your code:
def parse_style_attribute(style_string):
    if 'background-image' in style_string:
        style_string = style_string.split(' url("')[1].replace('");', '')
        return style_string
    return None
This is simple string parsing that extracts the URL when "background-image" appears in the string, and returns None when there is no image.
You can then use it in your code:
links = list()
for ii in driver.find_elements_by_xpath('//a[@style]'):
    o = ii.get_attribute('style')
    links.append(parse_style_attribute(o))
links = [link for link in links if link is not None]
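A regular expression is a slightly more forgiving alternative, since it does not care whether the browser quotes the URL inside url(...). A minimal sketch (the function name is illustrative, not from the original post):

import re

def extract_background_url(style_string):
    # grab whatever sits inside url(...) and strip optional quotes
    m = re.search(r'url\((.*?)\)', style_string)
    if m is None:
        return None
    return m.group(1).strip('\'"')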
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
s = Session()  # requests session used by get_data_from_page below

def get_total_pages():
    tut = []
    base_url = 'Your group '
    for url in [base_url % i for i in range(1, 27)]:
        tut.append(url)
        print(tut)
    #get_data_from_page(tut)
    pool = ThreadPool(8)
    results = pool.map(get_data_from_page, tut)

def get_data_from_page(tut):
    f = open("emails.txt", 'a')
    email = []
    for a in tut:
        link = s.get(a).text
        soup = BeautifulSoup(link, 'lxml')
        links = soup.find('div', class_="mens").find_all('span', class_="inviz")
        for e in links:
            emails = e.text
            f.write(emails + ', ')
            email.append(emails)
        print(email)

def main():
    get_total_pages()

if __name__ == '__main__':
    main()
This results in an error saying it only works with multiprocessing, and:
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The problem was here:
for a in tut:
    link = s.get(a).text
pool.map already calls get_data_from_page with one URL at a time, so the argument is a single string; iterating over it yields individual characters, which is why requests complains about the invalid URL 'h'. What is needed is just
link = s.get(tut).text
# without the for loop, since the parameter is already a single URL
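Put together, a sketch of the worker rewritten for a single URL (selectors and file handling kept from the question; the Session object s is assumed to exist at module level):

def get_data_from_page(url):
    link = s.get(url).text  # url is one page address, not a list
    soup = BeautifulSoup(link, 'lxml')
    spans = soup.find('div', class_="mens").find_all('span', class_="inviz")
    emails = [e.text for e in spans]
    with open("emails.txt", 'a') as f:
        f.write(', '.join(emails) + ', ')
    print(emails)
    return emails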
I am running Python 3 on an Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine, other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically, the code tries to read the list of S&P 500 companies from Wikipedia and write the tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script instead tries to read an existing list from that file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys
sys.setrecursionlimit(100000)
def get_standard_and_poors_500_constituents():
    fname = (
        os.path.abspath(os.path.dirname(__file__)) + "/sp500_constituents.pkl"
    )
    try:
        # URL request, URL opener, read content.
        req = request.Request(
            "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        )
        opener = request.urlopen(req)
        # Convert bytes to UTF-8.
        content = opener.read().decode()
        soup = BeautifulSoup(content, "lxml")
        # HTML table we actually need is the first.
        tables = soup.find_all("table")
        external_class = tables[0].findAll("a", {"class": "external text"})
        c = [ext.string for ext in external_class if not "reports" in ext]
        with open(fname, "wb") as f:
            pickle.dump(c, f)
    except URLError:
        with open(fname, "rb") as f:
            c = pickle.load(f)
    finally:
        return c

sp500_constituents = get_standard_and_poors_500_constituents()
spdr_etf = "SPY"
sp500_index = "^GSPC"

def main():
    X = get_standard_and_poors_500_constituents()
    print(X)

if __name__ == "__main__":
    main()