I was given some code and, after working out its indentation problems, it runs without errors; however, I still cannot print the result out as a list.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import requests

symbol = 'AAPL'
url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + symbol + "&type=&dateb=&owner=exclude&start=0&count=100&output=atom"

uClient = uReq(url)
page_html = uClient.read()
uClient.close()
html = soup(page_html, 'html.parser')

entries = html.findAll("entry")
shouldContinue = True
link = ""
for entry in entries:
    if shouldContinue and (
            entry.find("category")["term"].lower() == "10-k" or entry.find("category")["term"].lower() == "10-q" or
            entry.find("category")["term"].lower() == "20-f"):
        firstUrl = entry.find("link")["href"]
        uClientFirstUrl = uReq(firstUrl)
        page_html_firstUrl = uClientFirstUrl.read()
        uClientFirstUrl.close()
        htmlFirstUrl = soup(page_html_firstUrl, 'html.parser')
        tds = htmlFirstUrl.findAll("table")[1].findAll("td")
        foundtd = False
        for td in tds:
            if foundtd == True:
                link = "https://www.sec.gov" + td.find("a")["href"]
                foundtd = False
            if "xbrl instance" in td.text.lower():
                foundtd = True
        shouldContinue = False


def getCash(url, symbol):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    xml = soup(page_html, 'xml')

    cash = xml.findAll("us-gaap:CashAndCashEquivalentsAtCarryingValue")
    if len(cash) == 0:
        cash = xml.findAll("ifrs-full:Cash")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents")
    if len(cash) == 0:
        cash = xml.findAll("us-gaap:Cash")
    return cash


print(getCash)
getCash(url, symbol)
I have tried printing the assignment as well as calling the function, without any success. A sense of direction would be appreciated. Thank you.
As mentioned in my comment above:
What effect do you expect from print(getCash)? If you want it to print the return value of the getCash() function, delete that line (it's not doing anything useful) and wrap your getCash(url, symbol) call in print().
Basically, do this:
print(getCash(url, symbol))
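For context, this small illustration (using a made-up function f, not from the question) shows the difference between printing a function object and printing the value it returns:

def f():
    return [1, 2, 3]

print(f)    # prints the function object itself, e.g. <function f at 0x7f...>
print(f())  # prints the value returned by the call: [1, 2, 3]

print(getCash) therefore only shows that getCash is a function; the list of tags you are after only exists once the function is actually called.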
I tried making a Python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing) and some of them appear several times (Success Tetteh, for example).
The output is too big to post here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: For some fighters the records are wrong (Vasyl Lomachenko, for example, appears to have 28 wins, but he has 14).
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time


def main():
    fighter_names = []
    fighter_wins = []
    fighter_losses = []
    fighter_draws = []

    username = "username"
    password = "password"
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }

    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            page = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(page.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                page = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(page.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        page = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(page.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            wins_span = soup.find_all('span', class_='textWon')
            loses_span = soup.find_all('span', class_='textLost')
            draws_span = soup.find_all('span', class_='textDraw')
            for container in names_a:
                name = container.text
                print(name)
                fighter_names.append(name)
            for container in wins_span:
                wins = container.text
                fighter_wins.append(wins)
            for container in loses_span:
                losses = container.text
                fighter_losses.append(losses)
            for container in draws_span:
                draws = container.text
                fighter_draws.append(draws)

    fighters = {
        "name": fighter_names,
        "wins": fighter_wins,
        "loses": fighter_losses,
        "draws": fighter_draws
    }
    df = pd.DataFrame.from_dict(fighters, orient="index")
    df = df.transpose()
    df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()
I would refrain from using the same variable name to represent two separate things... it looks like you have the page variable being used in two separate roles, which can be confusing.
As far as some of the issues go, I'm assuming that at some point there's a mismatch between the lists, so the corresponding data isn't lining up with the correct fighter name, or there's something off with the site's actual data/HTML. I'm not entirely sure, as I haven't debugged it. That said, have you considered using pandas to parse the table and then just splitting the 'w-l-d' column? I think it would be far easier to let pandas do the parsing, so as not to miss something in the 900+ pages you need to go through.
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math


def main():
    final_df = pd.DataFrame()

    username = 'username'
    password = 'password'
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }

    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            response = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(response.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                response = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(response.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                    while not names_a:
                        response = s.get(
                            "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                            "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                            + str(page))
                        soup = BeautifulSoup(response.text, 'html.parser')
                        names_a = soup.find_all('a', class_='personLink')
            df = pd.read_html(response.text)[-1]
            df = df[['name', 'w-l-d']]
            df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]  # <--- ADD THIS LINE
            df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
            df = df.drop('w-l-d', axis=1)
            print('Page: %d of %d' % (((page - 1) / 20) + 1, math.ceil(19152 / 20)))
            final_df = final_df.append(df, sort=False).reset_index(drop=True)

    final_df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()
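As a quick, self-contained illustration of what the read_html/split steps above do, here is a toy frame standing in for one page of pd.read_html(response.text)[-1] (the fighter names and records are invented):

import pandas as pd

# Stand-in for one page of results; the last row mimics a stray header/non-record row.
df = pd.DataFrame({'name': ['Fighter A', 'Fighter B', 'junk row'],
                   'w-l-d': ['14 1 0', '28 0 1', 'w-l-d']})

# Keep only rows whose record looks like three numbers separated by single characters.
df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]

# Split the record into separate wins / loses / draws columns.
df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
print(df.drop('w-l-d', axis=1))

The filter line is what drops the occasional non-record row that would otherwise break the split.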
I am web scraping pacsun.com with Python and trying to write the results into a CSV file, but when I open the file only the headers appear, not the product_name, price, or new_arrival values.
So my question is: how do I get these values to appear under the headers in a CSV file?
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import csv

my_url = ('https://www.pacsun.com/mens/')

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

product_data = page_soup.findAll('div', {'class': 'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename, 'w')
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div', {'class': 'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div', {'class': 'new'})
    #the code above gets whether the product is a new arrival
    print(price[0].text)
    print(new_arrival[0].text)
    thewriter = csv.DictWriter(filename, headers)
    thewriter.writerow({'product_name': product_name, 'price': price, 'new_arrival': new_arrival})
    #f.write(product_name.replace(",", "|") + "," + price + ","+ new_arrival + "\n")

f.close()
You have a problem with how the data is written, so I fixed it and it works fine. You only need to change 'w' to 'a', i.e. f = open(filename, 'a'), and to put the f.write call inside the loop.
from bs4 import BeautifulSoup as soup
import csv
from urllib.request import urlopen as uReq

my_url = ('https://www.pacsun.com/mens/')

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

product_data = page_soup.findAll('div', {'class': 'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename, "a")
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div', {'class': 'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div', {'class': 'new'})

    price_ = ''
    new_arrival_ = ''
    product_name_ = ''
    # product_name_ = ' '.join([str(elem) for elem in product.div.a["title"]])
    for price_text in price:
        price_ = price_text.text
    for new_arrival_text in new_arrival:
        new_arrival_ = new_arrival_text.text

    f.write(product.div.a["title"] + "," + price_ + "," + new_arrival_ + "\n")

f.close()
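If you would rather stay with the csv module that the original attempt used, note that csv.DictWriter expects an open file object and a list of field names, not a filename and a header string. A minimal sketch of that variant (reusing the product_data results from above, with the same class names assumed):

import csv

with open('pacsun.csv', 'w', newline='') as f:
    fieldnames = ['product_name', 'price', 'new_arrival']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for product in product_data:
        price = product.findAll('div', {'class': 'product-price group'})
        new_arrival = product.findAll('div', {'class': 'new'})
        writer.writerow({
            'product_name': product.div.a["title"],
            'price': price[0].text.strip() if price else '',
            'new_arrival': new_arrival[0].text.strip() if new_arrival else '',
        })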
I wrote a Python program for scraping the first image link for a query on Wikipedia.
My program requires these libraries:
requests
bs4
html
re
When I run my code and give it an argument, it returns one of my defined errors ('Image-Not-Found'). Please help me to solve the problem.
My Python program's source code:
import requests
import bs4
import re
import html
import argparse

# Create the parser
my_parser = argparse.ArgumentParser(description='Wikipedia Image Grabber')

# Add the arguments
my_parser.add_argument('Phrase',
                       metavar='Phrase',
                       type=str,
                       help='Phrase to Search')

# Execute the parse_args() method
args = my_parser.parse_args()
Phrase = args._get_kwargs()[0][1]

if '.' in Phrase or '-' in Phrase:
    if '.' in Phrase and '-' in Phrase:
        Phrase = str(Phrase).replace('-', ' ')
    elif '-' in Phrase and not '.' in Phrase:
        Phrase = str(Phrase).replace('-', ' ')

Phrase = html.escape(Phrase)
request = requests.get('https://fa.wikipedia.org/wiki/Special:Search?search=%s&go=Go&ns0=1' % Phrase).text
parser = bs4.BeautifulSoup(request, 'html.parser')

none_search_finder = parser.find_all('p', attrs={'class': 'mw-search-nonefound'})
if len(none_search_finder) == 1:
    print('No-Result')
    exit()
else:
    search_results = parser.find_all('div', attrs={'class': 'mw-search-result-heading'})
    if len(search_results) == 0:
        search_result = parser.find_all('h1', attrs={'id': 'firstHeading'})
        if len(search_result) != 0:
            link = 'https://fa.wikipedia.org/wiki/' + str(Phrase)
        else:
            print('Result-Error')
            exit()
    else:
        selected_result = search_results[0]
        regex_exp = r".*<a href=\"(.*)\" title="
        regex_get_uri = re.findall(regex_exp, str(selected_result))
        regex_result = str(regex_get_uri[0])
        link = 'https://fa.wikipedia.org' + regex_result

#---------------
second_request = requests.get(link)
second_request_source = second_request.text
second_request_parser = bs4.BeautifulSoup(second_request_source, 'html.parser')

image_finder = second_request_parser.find_all('a', attrs={'class': 'image'})
if len(image_finder) == 0:
    print('No-Image')
    exit()
else:
    image_finder_e = image_finder[0]
    second_regex = r".*src=\"(.*)\".*decoding=\"async\""
    regex_finder = re.findall(second_regex, str(image_finder_e))
    if len(regex_finder) != 0:
        regexed_uri = str(regex_finder[0])
        img_link = regexed_uri.replace('//', 'https://')
        print(img_link)
    else:
        print("Image-Not-Found")
You can do this without regex. The reason your code is not working is that the position of decoding="async" in the HTML you get back is not the same as what you see in the browser, so your pattern never matches.
Here is a solution without regex.
import re
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Google'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

imglinks = soup.find_all('a', attrs={'class': 'image'})[0]
for img in imglinks.find_all('img'):
    print(img['src'].replace('//', 'https://'))
Output:
https://upload.wikimedia.org/wikipedia/commons/thumb/2/2f/Google_2015_logo.svg/196px-Google_2015_logo.svg.png
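Applied to the original script, the same idea would replace the second_regex block with a direct attribute lookup, along these lines (a sketch reusing the asker's variable names, not tested against fa.wikipedia.org):

# Instead of matching src=... with a regex, read the attribute directly from the <img> tag.
image_finder_e = image_finder[0]
img_tag = image_finder_e.find('img')
if img_tag is not None and img_tag.has_attr('src'):
    img_link = img_tag['src'].replace('//', 'https://', 1)
    print(img_link)
else:
    print("Image-Not-Found")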
Which function (or other approach) would be best so that these nicknames do not repeat in my parser? I don't know how to do that. I'll be very grateful if you help me.
Source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    if uClient.url != link:
        break
    page_html = uClient.read()
    # Check if there was a redirect
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        print("BattlePass PlayerName: " + playerName)
        f.write(playerName + "\n")
You can add all the names to a set.
A set object is an unordered collection of distinct hashable objects. Common uses include membership testing, removing duplicates from a sequence, and computing mathematical operations such as intersection, union, difference, and symmetric difference.
my_set = set()
# Lets add some elements to a set
my_set.add('a')
my_set.add('b')
print(my_set) # prints {'a', 'b'}
# Add one more 'a'
my_set.add('a')
print(my_set) # still prints {'a', 'b'} !
In your case, let's add all the names to a set and then write to the file after the for loop.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
names = set()
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    if uClient.url != link:
        break
    page_html = uClient.read()
    # Check if there was a redirect
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        names.add(playerName)

for name in names:
    f.write(name + "\n")
f.close()
EDIT
Sets do not preserve the order. If you want to retain the order, just use lists.
...
names = []
while True:
    ...
    for container in containers:
        playerName = container.div.a.text.strip()
        if playerName not in names:
            names.append(playerName)

for name in names:
    f.write(name + "\n")
f.close()
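As a side note (not part of the original answer): on Python 3.7+, plain dicts also preserve insertion order, so dict.fromkeys gives ordered de-duplication without the linear "not in" lookup on a list:

# Ordered de-duplication via dict keys (assumes Python 3.7+).
seen_in_order = ["alpha", "beta", "alpha", "gamma", "beta"]
unique_names = list(dict.fromkeys(seen_in_order))
print(unique_names)  # ['alpha', 'beta', 'gamma']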
I am trying to pull a table from a list of URLs. When I input only one URL it only prints out the first items in the table, and when I add more URLs to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and to add more URLs?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd

timeDelay = random.randrange(5, 20)

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')

product_name_list = []
cat_no_list = []
size_list = []
price_list = []

for container in containers:
    if (len(container) > 0):
        #try:
        title_container = container.findAll('td')
        Product_name = title_container[0].text.strip()
        product_name_list.append(Product_name)

        CatNo_container = container.findAll('td')
        CatNo = CatNo_container[1].text.strip()
        cat_no_list.append(CatNo)

        #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
        #Size = Size_container[0].text.strip()
        #size_list.append(Size)

        Price_container = container.findAll('td')
        Price = Price_container[4].text.strip()
        price_list.append(Price)

        print('Product_name: ' + Product_name)
        print('CatNo: ' + CatNo)
        print('Size: ' + 'N/A')
        print('Price: ' + Price)
        print(" ")

        time.sleep(timeDelay)
You are passing a list here: uClient = uReq(my_urls), where a single URL string is required.
You need to pass the individual elements of the list, i.e. the strings.
Here is the edited code that works for multiple URLs.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd

timeDelay = random.randrange(5, 20)

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

for url in my_urls:
    print("URL using: ", url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll('tbody')

    product_name_list = []
    cat_no_list = []
    size_list = []
    price_list = []

    for container in containers:
        if (len(container) > 0):
            #try:
            items = container.findAll('tr')
            for item in items:
                item = item.text.split('\n')
                Product_name = item[1]
                product_name_list.append(Product_name)

                CatNo = item[2]
                cat_no_list.append(CatNo)

                #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
                #Size = Size_container[0].text.strip()
                #size_list.append(Size)

                Price = item[6]
                price_list.append(Price)

                print('Product_name: ' + Product_name)
                print('CatNo: ' + CatNo)
                print('Size: ' + 'N/A')
                print('Price: ' + Price)
                print(" ")

    time.sleep(timeDelay)
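If you also want to keep the scraped rows instead of only printing them, one option (an extension of the answer above, not part of the original) is to collect one dict per row and hand the list to pandas at the end, since pandas is already imported. A self-contained sketch with invented example rows:

import pandas as pd

# In the loop above you would append one dict per <tr> instead of using these sample rows.
all_rows = [
    {'product_name': 'Example product', 'cat_no': 'EX-001', 'price': '$10.00'},
    {'product_name': 'Another product', 'cat_no': 'EX-002', 'price': '$12.50'},
]

df = pd.DataFrame(all_rows, columns=['product_name', 'cat_no', 'price'])
df.to_csv("scraped_products.csv", index=False)  # output filename is an assumption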