How can I extract the links from HTML?

How can I extract the links from HTML? - python-3.x

I'm trying to get a link of every article in this category on the SF chronicle but I'm not sure as to where I should begin on extracting the URLs. Here is my progress so far:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.sfchronicle.com/local/'
# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
zone2_container = page_soup.findAll("div",{"class":"zone zone-2"})
zone3_container = page_soup.findAll("div",{"class":"zone zone-3"})
zone4_container = page_soup.findAll("div",{"class":"zone zone-4"})
right_rail_container = page_soup.findAll("div",{"class":"right-rail"})
All of the links I want are located in zone2-4_container and right_rail_container.

You can use the following code to get all links:
all_zones = [zone2_container, zone3_container, zone4_container, right_rail_container]
urls = []
for i in all_zones:
links = i[0].findAll('a')
for link in links:
urls.append(link['href'])
I have merged all the lists in one list but you can also define a function to achieve the same.
def get_urls(zone):
urls = []
for i in zone:
links = i.findAll('a')
for link in links:
urls.append(link['href'])
return urls
get_urls(zone2_container)

It now sounds like you basically want all the article links, in which case you can use an attribute = value css selector with contains operator to target href attributes whose value contains the substring 'article'.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
base = 'https://www.sfchronicle.com/'
url = 'https://www.sfchronicle.com/local/'
res = requests.get(url)
soup = bs(res.content, 'lxml')
links = [urljoin(base,link['href']) for link in soup.select('[href*=article]')]
print(links)
print(len(links))

Related

How to only retrieve the tag I specify using BeautifulSoup

I just want the written text out of this website: https://algorithms-tour.stitchfix.com/ so I can put it in Word doc and read it.
When I run the code, I get all the html and the tags, at the very end I get what I want, but I just want to separate the text.
import requests
from bs4 import BeautifulSoup
url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
item = soup.find_all("p")
print(item)
Is there a way to get just content so I can clean it up some more?

You have a few options for this. If you only want text found within p tags, you can do this:
import requests
from bs4 import BeautifulSoup
url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all("p")
result = []
for item in items:
result.append(item.string)
print(result)
Note that soup.find_all returns an iterable list, and not a single object.
An alternative, and easier method is to just use soup.get_text:
import requests
from bs4 import BeautifulSoup
url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())

How to scrape from web all children of an attribute with one class?

I have tried to get the highlighted area (in the screenshot) in the website using BeautifulSoup4, but I cannot get what I want. Maybe you have a recommendation doing it with another way.
Screenshot of the website I need to get data from
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
import urllib
import csv
import html5lib
urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
'https://e-mehkeme.gov.az/Public/Cases?page=2'
]
# scrape elements
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
content = soup.findAll("input", class_="casedetail filled")
print(content)
My expected output is like this:
Ətraflı məlumat:
İşə baxan hakim və ya tərkib
Xəyalə Cəmilova - sədrlik edən hakim
İlham Kərimli - tərkib üzvü
İsmayıl Xəlilov - tərkib üzvü
Tərəflər
Cavabdeh: MAHMUDOV MAQSUD SOLTAN OĞLU
Cavabdeh: MAHMUDOV MAHMUD SOLTAN OĞLU
İddiaçı: QƏHRƏMANOVA AYNA NUĞAY QIZI
İşin mahiyyəti
Mənzil mübahisələri - Mənzildən çıxarılma

Using the base url first get all the caseid and then pass those caseid to target url and then get the value of the first td tag.
import requests
from bs4 import BeautifulSoup
urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
'https://e-mehkeme.gov.az/Public/Cases?page=2'
]
target_url="https://e-mehkeme.gov.az/Public/CaseDetail?caseId={}"
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
for caseid in soup.select('input.casedetail'):
#print(caseid['value'])
soup1=BeautifulSoup(requests.get(target_url.format(caseid['value'])).content,'html.parser')
print(soup1.select_one("td").text)

I would write it this way. Extracting the id that needs to be put in GET request for detailed info
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1','https://e-mehkeme.gov.az/Public/Cases?page=2']
def get_soup(url):
r = s.get(url)
soup = bs(r.content, 'lxml')
return soup
with requests.Session() as s:
for url in urls:
soup = get_soup(url)
detail_urls = [f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i["value"]}' for i in soup.select('.caseId')]
for next_url in detail_urls:
soup = get_soup(next_url)
data = [string for string in soup.select_one('[colspan="4"]').stripped_strings]
print(data)

BeautifulSoup python: Get the text with no tags and get the adjacent links

I am trying to extract the movie titles and links for it from this site
from bs4 import BeautifulSoup
from requests import get
link = "https://tamilrockerrs.ch"
r = get(link).content
#r = open('json.html','rb').read()
b = BeautifulSoup(r,'html5lib')
a = b.findAll('p')[1]
But the problem is there is no tag for the titles. I can't extract the titles and if I could do that how can I bind the links and title together.
Thanks in Advance

You can find title and link by this way.
from bs4 import BeautifulSoup
import requests
url= "http://tamilrockerrs.ch"
response= requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
data = soup.find_all('div', {"class":"title"})
for film in data:
print("Title:", film.find('a').text) # get the title here
print("Link:", film.find('a').get("href")) #get the link here

How to only retrieve links from a list in Python3? [Beginner]

I have list on python3 that goes something like this:
https://textuploader.com/15dra
From this file I want to make a new list that only takes the urls from the other list seperated by commas and contained in double quotes ("), and if possible also filter all urls that contain "i.redd.it"
Here is the code if it helps:
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import urllib3
http = urllib3.PoolManager()
url = "https://reddit.com/r/me_irl"
response = http.request('GET', url)
soup = BeautifulSoup(response.data, "lxml")
tags = soup.find_all('a')
hrefs = []
for t in tags:
hrefs.append(t)
print(hrefs)

You could do a list comprehension. I would also include this line:
tags = soup.find_all('a', href=True)
since you only want the tags with an url
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import urllib3
http = urllib3.PoolManager()
url = "https://reddit.com/r/me_irl"
response = http.request('GET', url)
soup = BeautifulSoup(response.data, "lxml")
tags = soup.find_all('a', href=True)
hrefs = [ ele['href'] for ele in tags if 'i.redd.it' in ele['href']]
However, this will return an empty list, as there are no hrefs in there that contain 'i.redd.it'
But if you want the urls, you could get rid of the if statement, or change it if you'd like:
hrefs = [ ele['href'] for ele in tags ]

How can I get the href of anchor tag using Beautiful Soup?

I am trying to get the href of anchor tag of the very first video search on YouTube using Beautiful Soup. I am searching it by using the "a" and class_="yt-simple-endpoint style-scope ytd-video-renderer".
But I am getting None output:
from bs4 import BeautifulSoup
import requests
source = requests.get("https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing").text
soup = BeautifulSoup(source,'lxml')
# print(soup2.prettify())
a =soup.findAll("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")
a_fin = soup.find("a", class_="compact-media-item-image")
#
print(a)

from bs4 import BeautifulSoup
import requests
source = requests.get("https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing").text
soup = BeautifulSoup(source,'lxml')
first_serach_result_link = soup.findAll('a',attrs={'class':'yt-uix-tile-link'})[0]['href']
heavily inspired by
this answer

Another option is to render the page first with Selenium.
import bs4
from selenium import webdriver
url = 'https://www.youtube.com/results?search_query=MP+election+results+2018%3A+BJP+minister+blames+conspiracy+as+reason+while+losing'
browser = webdriver.Chrome('C:\chromedriver_win32\chromedriver.exe')
browser.get(url)
source = browser.page_source
soup = bs4.BeautifulSoup(source,'html.parser')
hrefs = soup.find_all("a", class_="yt-simple-endpoint style-scope ytd-video-renderer")
for a in hrefs:
print (a['href'])
Output:
/watch?v=Jor09n2IF44
/watch?v=ym14AyqJDTg
/watch?v=g-2V1XJL0kg
/watch?v=eeVYaDLC5ik
/watch?v=StI92Bic3UI
/watch?v=2W_4LIAhbdQ
/watch?v=PH1WZPT5IKw
/watch?v=Au2EH3GsM7k
/watch?v=q-j1HEnDn7w
/watch?v=Usjg7IuUhvU
/watch?v=YizmwHibomQ
/watch?v=i2q6Fm0E3VE
/watch?v=OXNAMyEvcH4
/watch?v=vdcBtAeZsCk
/watch?v=E4v2StDdYqs
/watch?v=x7kCuRB0f7E
/watch?v=KERtHNoZrF0
/watch?v=TenbA4wWIJA
/watch?v=Ey9HfjUyUvY
/watch?v=hqsuOT0URJU

It dynamic html you can use Selenium or to get static html use GoogleBot user-agent
headers = {'User-Agent' : 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
source = requests.get("https://.......", headers=headers).text
soup = BeautifulSoup(source, 'lxml')
links = soup.findAll("a", class_="yt-uix-tile-link")
for link in links:
print(link['href'])

Try looping over the matches:
import urllib2
data = urllib2.urlopen("some_url")
html_data = data.read()
soup = BeautifulSoup(html_data)
for a in soup.findAll('a',href=True):
print a['href']

The class which you're searching does not exist in the scraped html. You can identify it by printing the soup variable.
For example:
a =soup.findAll("a", class_="sign-in-link")
gives output as:
[<a class="sign-in-link" href="https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26feature%3Dplaylist%26hl%3Den%26next%3D%252Fresults%253Fsearch_query%253DMP%252Belection%252Bresults%252B2018%25253A%252BBJP%252Bminister%252Bblames%252Bconspiracy%252Bas%252Breason%252Bwhile%252Blosing&uilel=3&hl=en&service=youtube">Sign in</a>]

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How can I extract the links from HTML? - python-3.x

Related

How to only retrieve the tag I specify using BeautifulSoup

How to scrape from web all children of an attribute with one class?

BeautifulSoup python: Get the text with no tags and get the adjacent links

How to only retrieve links from a list in Python3? [Beginner]

How can I get the href of anchor tag using Beautiful Soup?

Categories

Resources