BeautifulSoup4 findAll() not getting all of the links on the webpage - python-3.x

I am trying to grab all of the 'a' links from a webpage:
from bs4 import BeautifulSoup
import requests
source_code = requests.get(starting_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
for link in soup.findAll('a'):
    href = link.get('href')
    print(href)
but the list that gets printed is not all of the links on the page. If I print out plain_text, I can see all of these links, but they are not printed as href.
First week learning python! All help is greatly appreciated. Thanks!
Update: I forgot to share the plain_text file here. Sorry for the confusion.
The plain_text is pretty long, so I'll just post the starting_url:
starting_url = 'https://freeexampapers.com/index.php?option=com_content&view=article&id=1&Itemid=101&jsmallfib=1&dir=JSROOT/IB'
And yes, I'm a high school student :-)
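One quick check worth running here (a minimal sketch, using the same starting_url as above): compare how many a tags with an href the parser actually sees against how often href= occurs in the raw HTML. A large gap usually means the remaining links are built by JavaScript, which requests plus BeautifulSoup alone will not execute.
import requests
from bs4 import BeautifulSoup

starting_url = 'https://freeexampapers.com/index.php?option=com_content&view=article&id=1&Itemid=101&jsmallfib=1&dir=JSROOT/IB'
plain_text = requests.get(starting_url).text
soup = BeautifulSoup(plain_text, "html.parser")

# Anchors the parser can see vs. raw occurrences of href= in the downloaded HTML.
print(len(soup.find_all('a', href=True)))
print(plain_text.count('href='))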

Since you have not given a data sample, here is a sample that you could try:
import re

soup = BeautifulSoup(html_page, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    print(link.get('href'))
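Note that the regex above only keeps absolute links that start with http://. If the page uses relative hrefs (which is common), they would be dropped entirely. A small sketch, assuming the soup and starting_url from the question, that resolves relative links instead:
from urllib.parse import urljoin

# Assuming `soup` and `starting_url` from the question above.
for link in soup.find_all('a', href=True):
    print(urljoin(starting_url, link['href']))  # resolves relative hrefs against the page URL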

This should do it.
import requests
from bs4 import BeautifulSoup

Link = 'https://animetosho.org/view/jacobswaggedup-kill-la-kill-bd-1280x720-mp4-batch.n677876'
q = requests.get(Link)
soup = BeautifulSoup(q.text, "html.parser")
# print(soup)
subtitles = soup.findAll('div', {'class': 'links'})
# print(subtitles)
with open("Anilinks.txt", "w") as f:
    for link in subtitles:
        x = link.find_all('a', limit=26)
        for a in x:
            url = a['href']
            f.write(url + '\n')
Now, if you want to do something like keep only certain links from that text file (here, the solidfiles.com ones), do the following.
# Store the links we need in a list
links_to_keep = []
with open("Anilinks.txt", "r") as f:
    for line in f.readlines():
        if 'solidfiles.com' in line:
            links_to_keep.append(line)

# Write all the links in our list to the file
with open("Anilinks.txt", "w") as f:
    for link in links_to_keep:
        f.write(link)
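If you only ever care about one host, you could also filter while writing instead of rewriting the file afterwards. A one-pass sketch, assuming the same subtitles result set from above:
# One-pass variant: write only the solidfiles.com links, assuming `subtitles` from above.
with open("Anilinks.txt", "w") as f:
    for block in subtitles:
        for a in block.find_all('a', href=True):
            if 'solidfiles.com' in a['href']:
                f.write(a['href'] + '\n')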

Related

web scraping from news articles

I have been trying to access the links from a given news website. I found code which works really well, but the only issue is that it outputs "javascript:void();" along with all the other links. Please let me know what changes I can make so that I don't get "javascript:void();" in the output with all the other links.
The following is the code:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("https://www.ndtv.com/coronavirus?pfrom=home-mainnavgation")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
    print(link['href'])
If you don't want them, just filter them out.
Here's how:
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
resp = requests.get("https://www.ndtv.com/coronavirus?pfrom=home-mainnavgation")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=encoding)
for link in soup.find_all('a', href=True):
    if link["href"] != "javascript:void();":
        print(link['href'])
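The comparison above only catches that one literal value. If the site ever emits javascript:void(0); or bare fragment anchors, they would slip through. A slightly broader filter, assuming the same soup, as a sketch:
# Broader filter, assuming `soup` from above: skip javascript: pseudo-links
# and bare fragment anchors, print everything else.
for link in soup.find_all('a', href=True):
    href = link['href'].strip()
    if not href or href.startswith('javascript:') or href.startswith('#'):
        continue
    print(href)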

How to scrape multiple pages with requests in python

I recently started getting into web scraping and I have managed OK, but now I'm stuck and I can't find the answer or figure it out.
Here is my code for scraping and exporting info from a single page
import requests
page = requests.get("https://www.example.com/page.aspx?sign=1")
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
#finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]
#finds the right paragraph to grab
reading = soup.find_all('p')[0].text
print (heading, reading)
import csv
from datetime import datetime
# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([heading, reading, datetime.now()])
The problem occurs when I try to scrape multiple pages at the same time.
They are all the same; only the pagination changes, e.g.
https://www.example.com/page.aspx?sign=1
https://www.example.com/page.aspx?sign=2
https://www.example.com/page.aspx?sign=3
https://www.example.com/page.aspx?sign=4 etc
Instead of writing the same code 20 times, how do I collect all the data in a tuple or an array and export it to CSV?
Many thanks in advance.
Just use a loop and keep requesting pages until one is no longer available (the response is not OK). Should be easy to follow.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

results = []
page_number = 1
while True:
    response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.content, 'html.parser')
    # finds the right heading to grab
    box = soup.find('h1').text
    heading = box.split()[0]
    # finds the right paragraph to grab
    reading = soup.find_all('p')[0].text

    # append a list
    # results.append([heading, reading, datetime.now()])
    # or a tuple.. your call
    results.append((heading, reading, datetime.now()))
    page_number = page_number + 1

with open('index.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for result in results:
        writer.writerow(result)
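If you already know roughly how many pages exist (the question mentions about 20), a plain for loop over the page numbers may read more simply than probing with while True. A sketch under that assumption, with the same scraping logic:
import csv
from datetime import datetime

import requests
from bs4 import BeautifulSoup

results = []
for page_number in range(1, 21):  # assuming pages 1..20 exist
    response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
    if response.status_code != 200:
        continue  # skip missing pages instead of stopping
    soup = BeautifulSoup(response.content, 'html.parser')
    heading = soup.find('h1').text.split()[0]
    reading = soup.find_all('p')[0].text
    results.append((heading, reading, datetime.now()))

with open('index.csv', 'a', newline='') as csv_file:
    csv.writer(csv_file).writerows(results)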

Extracting a table from Webpage in Python

I have tried reading a table from a website. As you can see from my code, I have gone about it in a roundabout way to get the table; I would appreciate it if someone could show me a quicker method of doing the same.
Here's my code:
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
with open('myfile.txt', 'w') as file:
    file.writelines(text)
with open('myfile.txt', 'r') as g:
    f = g.readlines()
tab = f[12:31]
table = [x.strip() for x in tab]
Every time I run the code, it gets messy with writing and then re-reading the file.
You shouldn't need files. Filter for the pre tag instead, to target the table alone.
soup = BeautifulSoup(html, "html.parser")
pre = soup.find('pre')
table = [x.strip() for x in pre.get_text().splitlines() if x.strip()]
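Putting that together without the intermediate file, a self-contained sketch (assuming the codon table really is the first pre element on the page, as above):
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

# Assuming the table is inside the first <pre> block, as in the answer above.
pre = soup.find('pre')
table = [line.strip() for line in pre.get_text().splitlines() if line.strip()]
for row in table:
    print(row)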

Python: Scraping links into a CSV

I am relatively new to Python. I am trying to scrape URLs from a site and write them to a CSV file. I have been able to print the URLs; however, I have been unable to write them or store them anywhere. Any help?
import requests
import csv
from bs4 import BeautifulSoup
url = 'http://comm.eval.org/communities/resources/libraryview?LibraryKey=1eff4fd7-afa0-42e1-b275-f65881b7489b'
r=requests.get(url)
html_url = r.text
soup = BeautifulSoup(html_url, "html.parser")
with open('output.csv', 'wb') as f:
    bsoup_writer = csv.writer(f)
    for link in soup.find_all('a'):
        bsoup_writer.writerow([link.get('href'), link.get('class'), link, get('id')])
This should do what you're looking for:
with open('output.csv', 'w', newline='') as f:
    bsoup_writer = csv.writer(f)
    for link in soup.find_all('a'):
        bsoup_writer.writerow([link.get('href'), link.get('class'), link.get('id')])
Be sure to include the following csv import at the top of your script:
import csv

HTML confusion with Python BeautifulSoup

I'm following thenewboston's tutorials on YouTube, and after running my code I get no errors.
I'm trying to print the "Generic Line List" and all the links following that list; it can be found at the bottom of this link:
http://playrustwiki.com/wiki/List_of_Items
import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:  # makes our pages change every time
        url = 'http://playrustwiki.com/wiki/List_of_Items' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)  # find all the links in soup or all the titles
        for link in soup.findAll('a', {'class': 'a href'}):  # links are a for anchors in HTML
            href = link.get('href')  # href attribute
            print(href)
        page += 1

trade_spider(1)
I've tried different HTML attributes, but I think that's where my confusion starts. I can't find the correct attribute to call for my scraper, or I'm calling the wrong attribute.
Please help~
Thanks :)
The idea here would be to find the element that has the "Generic Line List" text. Then, find the next ul sibling via find_next_sibling() and get all links inside via find_all():
h3 = soup.find('h3', text='Generic Line List')
generic_line_list = h3.find_next_sibling('ul')
for link in generic_line_list.find_all('a', href=True):
    print(link['href'])
Demo:
>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> url = 'http://playrustwiki.com/wiki/List_of_Items'
>>> soup = BeautifulSoup(requests.get(url).content)
>>>
>>> h3 = soup.find('h3', text='Generic Line List')
>>> generic_line_list = h3.find_next_sibling('ul')
>>> for link in generic_line_list.find_all('a', href=True):
...     print(link['href'])
...
/wiki/Wood_Barricade
/wiki/Wood_Shelter
...
/wiki/Uber_Hunting_Bow
/wiki/Cooked_Chicken_Breast
/wiki/Anti-Radiation_Pills
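One caveat: find('h3', text='Generic Line List') only matches when the heading text is exactly that string, so h3 comes back as None if the wiki heading gains extra whitespace or nested markup. A more defensive sketch, assuming the same soup:
import re

# Loose match on the heading text, with a clean fallback if the layout changes.
h3 = soup.find('h3', string=re.compile(r'Generic\s+Line\s+List', re.I))
if h3 is None:
    print('Heading not found; the page layout may have changed.')
else:
    for link in h3.find_next_sibling('ul').find_all('a', href=True):
        print(link['href'])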
