python doesn't get page content - python-3.x

I've got the Python BeautifulSoup script below (adapted to Python 3 from that script). It executes fine, but nothing is returned in cmd.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

Newlines = re.compile(r'[\r\n]\s+')

def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup.findAll('script'):
        s.replaceWith('')
    # find body and extract text
    txt = soup.find('body').getText('\n')
    # remove multiple linebreaks and whitespace
    return Newlines.sub('\n', txt)

def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead'
    ]
    txt = [getPageText(url) for url in urls]

if __name__=="__main__":
    main()
Here's my cmd output
Microsoft Windows [Version 10.0..]
(c) Microsoft Corporation. All rights reserved.
C:\Users\user\Desktop\urls>python urls.py
C:\Users\user\Desktop\urls>
Why doesn't it return the pages' contents?

Nothing is returned in cmd because there is no print statement in the code.
If you want to print out all the text parsed from the given URLs, just use the print function in main():
def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead'
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)
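If you want to reuse the scraped text elsewhere rather than only print it, one small variation (just a sketch, not part of the original answer) is to have main() return the list as well:

def main():
    urls = [
        'http://www.stackoverflow.com/questions/5331266/python-easiest-way-to-scrape-text-from-list-of-urls-using-beautifulsoup',
        'http://stackoverflow.com/questions/5330248/how-to-rewrite-a-recursive-function-to-use-a-loop-instead'
    ]
    # collect the page texts once, print them, and hand them back to the caller
    texts = [getPageText(url) for url in urls]
    for t in texts:
        print(t)
    return texts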

Related

Why export to CSV only returns URLs but not text with Pandas dataframe?

I'm trying to get the URLs (lnk) and the paragraphs (txt) extracted by the Python script below into a CSV with pandas.
For some reason the generated CSV contains the headers (lnk and txt) and the URLs only, but not the corresponding paragraphs.
The CSV file currently looks like this:
lnk | txt
url 1 |
url 2 |
What I need would be
lnk | txt
url 1 | text 1
url 2 | text 2
But both the URLs and the paragraphs do get printed in the cmd console.
Why don't the paragraphs get exported into the CSV as well?
What would be a working fix for this problem? Thanks.
(Sorry for the long code, I'm new to Python.)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url_txt = []

#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replaceWith('')
    #remove the text from this class
    soup.find('p', {"class":"tiny-off"}).decompose()
    #remove the text from this div id
    soup.find('div', id = 'action-bar-top').decompose()
    #remove the text from this div id
    soup.find('div', id = 'main-content-sidebar').decompose()
    #remove the text from this class
    soup.find('div', {"class":"legal"}).decompose()
    #get the 1st paragraph (which is a link)
    for p in soup.find_all('p')[0]:
        lnk = p.get_text()
        print(lnk)
    #remove the 1st paragraph (the link) from the following combined paragraphs
    soup.find('p', id = 'current_url').decompose()
    #extract all paragraphs save the 1st (the link)
    for p in soup.find_all('p'):
        txt = p.get_text().replace("\r", "").replace("\n", "")
        print(txt)
    # Compiling the info
    lnktxt_data = [lnk, txt]
    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)

#Get text from multiple urls
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

if __name__=="__main__":
    main()

#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'txt'])
url_txt.head()

#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
I've found a simpler way that works (with room for improvement), with help from these two previous answers:
How to join newlines into a paragraph in python
How to scrape web news and combine paragraphs into each article
Please let me know below how you would improve it if you find a better way.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replaceWith('')
    #
    for p in soup.find_all('p')[1]:
        lnk = p.get_text()
        print(lnk)
    #
    # find body and extract text
    p = soup.find("div", attrs={'class': 'article-content retro-folders'})
    p.append(p.get_text())
    x = p.text
    y = x.replace("\r", "").replace("\n", "")
    print(y)
    # Compiling the info
    lnktxt_data = [lnk, y]
    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)

url_txt = []

#Get text from multiple urls
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

if __name__=="__main__":
    main()

#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'y'])
url_txt.head()

#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
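One way this could be improved (a sketch only, not the original poster's final code): return the (lnk, txt) pair from getPageText and build the DataFrame from the returned rows, so the module-level url_txt list isn't needed. The selectors ('current_url', 'article-content retro-folders') are carried over from the code above and are assumptions about the real target pages, not the dummy Stack Overflow URLs:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

def getPageText(url):
    # fetch and parse the page
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # drop script and style content
    for s in soup(["script", "style"]):
        s.replace_with('')
    # selectors below are carried over from the code above (assumptions about the real pages)
    lnk_tag = soup.find('p', id='current_url')
    lnk = lnk_tag.get_text() if lnk_tag else url
    body = soup.find('div', attrs={'class': 'article-content retro-folders'})
    txt = body.get_text().replace("\r", " ").replace("\n", " ") if body else ''
    # return one row per page instead of appending to a global list
    return [lnk, txt]

def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
    ]
    rows = [getPageText(url) for url in urls]
    url_txt = pd.DataFrame(rows, columns=['lnk', 'txt'])
    url_txt.to_csv('url_txt.csv', index=False)

if __name__ == "__main__":
    main()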

How to use BeautifulSoup to retrieve the URL of a tarfile

The following webpage contains all the source code URLs for the LFS project:
https://linuxfromscratch.org/lfs/view/systemd/chapter03/packages.html
I've written some Python 3 code to retrieve all these URLs from that page:
#!/usr/bin/env python3
from requests import get
from bs4 import BeautifulSoup
import re
import sys, os

#url=sys.argv[1]
url="https://linuxfromscratch.org/lfs/view/systemd/chapter03/packages.html"
exts = (".xz", ".bz2", ".gz", ".lzma", ".tgz", ".zip")
response = get(url)
soup = BeautifulSoup(response.content, 'html.parser')
for link in soup.find_all('a', href=True):
    if link.get('href'):
        for anhref in link.get('href').split():
            if os.path.splitext(anhref)[-1] in exts:
                print((link.get('href')))
What I would like to do is input a pattern, say:
pattern = 'iproute2'
and then print the line that contains the iproute2 tarfile
which happens to be:
https://www.kernel.org/pub/linux/utils/net/iproute2/iproute2-5.12.0.tar.xz
I've tried using match = re.search(pattern, text), and it finds the correct line, but if I print match I get:
<re.Match object; span=(43, 51), match='iproute2'>
How do I get it to print the actual URL?
You can use the .string property of the match object (it returns the string that was passed to re.search()).
Code Example

import re

txt = "https://www.kernel.org/pub/linux/utils/net/iproute2/iproute2-5.12.0.tar.xz"
pattern = 'iproute2'
match = re.search(pattern, txt)
if match:  # this condition is used to avoid a NoneType error
    print(match.string)
else:
    print('No Match Found')
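Applied to the scraping loop from the question, a minimal sketch (same page URL and extension filter as above) that prints only the tarfile URLs matching a pattern could look like this:

#!/usr/bin/env python3
import os
import re
from requests import get
from bs4 import BeautifulSoup

url = "https://linuxfromscratch.org/lfs/view/systemd/chapter03/packages.html"
exts = (".xz", ".bz2", ".gz", ".lzma", ".tgz", ".zip")
pattern = 'iproute2'

soup = BeautifulSoup(get(url).content, 'html.parser')
for link in soup.find_all('a', href=True):
    href = link['href']
    # keep only tarball-style links, then print the ones matching the pattern
    if os.path.splitext(href)[-1] in exts and re.search(pattern, href):
        print(href)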

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button on this page, but when I open the download link that I get from my code, I get this message.
I noticed that if I manually click the button and open the link in a new page, the csrfKey part of the link is always the same, whereas when I run the code I get a different key every time. Here's my code:
from bs4 import BeautifulSoup
import requests
import re

def GetPage(link):
    source_new = requests.get(link).text
    soup_new = BeautifulSoup(source_new, 'lxml')
    container_new = soup_new.find_all(class_='ipsButton')
    for data_new in container_new:
        #print(data_new)
        headline = data_new # Display text
        match = re.findall('download', str(data_new), re.IGNORECASE)
        if(match):
            print(f'{headline["href"]}\n')

if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    with requests.Session() as connection:
        r = connection.get("https://eci.gov.in/")
        confirmation_url = BeautifulSoup(
            connection.get(link).text, 'lxml'
        ).select_one(".ipsApp .ipsButton_fullWidth")["href"]
        fake_agree_to_continue = connection.get(
            confirmation_url.replace("?do=download", "?do=download&confirm=1")
        ).text
        download_links = [
            a["href"] for a in
            BeautifulSoup(
                fake_agree_to_continue, "lxml"
            ).select(".ipsApp .ipsButton_small")[1:]
        ]
        for download_link in download_links:
            response = connection.get(download_link)
            file_name = (
                response
                .headers["Content-Disposition"]
                .replace('"', "")
                .split(" - ")[-1]
            )
            print(f"Downloading: {file_name}")
            with open(file_name, "wb") as f:
                f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and an .xls.

Writing multiple files as output when webscraping - python bs4

To preface: I am quite new to Python and my HTML skills are kindergarten level.
I am trying to save the quotes from this website, which has many links in it, one for each of the US Election candidates.
I have managed to get the code to extract the quotes (with the help of some Stack Overflow users), but I am lost on how to write these quotes into separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes, should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes, should be written to MichaelBennet.txt (or something in that form), and so on. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re

#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'

def make_soup(url):
    # set up known browser user agent for the request to bypass HTMLError
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    #opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    #html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml")
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile("javascript:pop\("))
#print(tags)

# open a text file and write it if it doesn't exist
file1 = open("Quotefile.txt", "w")

# get list of all URLS
for links in tags:
    link = links.get('href')
    if "java" in link:
        print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
            #text_data = [] #This list can be used to store data related to every person
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                if(type(item) == str):
                    print(item.get_text())
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    print(item.get_text())
                    #Hence, grab that string too
                    print(item.next_sibling)
                elif(item.name in ["p", "ul", "ol"]):
                    print(item.get_text())
        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")

#print(text_data)
You can store that text data in the list you had started, text_data, join all those items, and then write them to a file.
So, something like:
import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re

#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'

def make_soup(url):
    # set up known browser user agent for the request to bypass HTMLError
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    #opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    #html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml")
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile("javascript:pop\("))
#print(tags)

# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt","w")

# get list of all URLS
candidates = []
for links in tags:
    link = links.get('href')
    if "java" in link:
        #print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
        candidate = link.split('/')[-1].split('_Free_Trade')[0]
        if candidate in candidates:
            continue
        else:
            candidates.append(candidate)
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
            text_data = [] #This list can be used to store data related to every person
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                if(type(item) == str):
                    #print(item.get_text())
                    text_data.append(item.get_text())
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    #print(item.get_text())
                    text_data.append(item.get_text())
                    #Hence, grab that string too
                    #print(item.next_sibling)
                    text_data.append(item.next_sibling)
                elif(item.name in ["p", "ul", "ol"]):
                    #print(item.get_text())
                    text_data.append(item.get_text())
        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")
            candidates.remove(candidate)
            continue
        text_data = '\n'.join(text_data)
        with open("C:/%s.txt" %(candidate), "w") as text_file:
            text_file.write(text_data)
        print('Acquired: %s' %(candidate))
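One possible refinement (a sketch, not part of the original answer): writing to the root of C:\ can fail without admin rights, and a candidate name taken from a URL may contain characters that are not valid in a filename. A helper like the hypothetical write_candidate_file below sanitizes the name and writes into a local output directory instead:

import os
import re

def write_candidate_file(candidate, text_data, out_dir="quotes"):
    # keep only characters that are safe in a filename (assumption: ASCII names)
    safe_name = re.sub(r'[^A-Za-z0-9_.-]', '_', candidate)
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, safe_name + ".txt")
    with open(path, "w", encoding="utf-8") as text_file:
        text_file.write(text_data)
    return path

It could replace the with open("C:/%s.txt" ...) block above, e.g. write_candidate_file(candidate, text_data).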

how to define parser when using BS4 in python

#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos"
response = requests.get(url)
# parse html
page = str(BeautifulSoup(response.content))

def getURL(page):
    """
    :param page: html of web page (here: Python home page)
    :return: urls in that page
    """
    start_link = page.find("a href")
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1: end_quote]
    return url, end_quote

while True:
    url, n = getURL(page)
    page = page[n:]
    if url:
        print(url)
    else:
        break
I am using the above code to get a list of all the YouTube videos on the webpage. If I try to do this, I get the following error:
The code that caused this warning is on line 9 of the file C:/Users/PycharmProjects/ReadCSVFile/venv/Links.py. To get rid of this warning, change code that looks like this:
I did, and started using html, but a different error came up.
I am using Python 3.0 with the PyCharm IDE.
Can someone please help me with this?
It's not an error but a warning: you didn't set the parser, which can be 'html.parser', 'lxml', or 'xml'. Change it to something like:
page = BeautifulSoup(response.content, 'html.parser')
Your code above isn't really using what BeautifulSoup does, but here is an example that uses it:
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

def getURL(url):
    """
    :param url: url of web page
    :return: urls in that page
    """
    response = requests.get(url)
    # parse html
    page = BeautifulSoup(response.content, 'html.parser')
    link_tags = page.find_all('a')
    urls = [x.get('href') for x in link_tags]
    return urls

url = "https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos"
all_url = getURL(url)
print('\n'.join(all_url))
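Note that page.find_all('a') also returns anchor tags without an href attribute; for those, x.get('href') is None and the '\n'.join(...) call fails. A small variation (a sketch) that only keeps tags that actually have an href:

#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup

def getURL(url):
    # return the href of every <a> tag on the page that has one
    response = requests.get(url)
    page = BeautifulSoup(response.content, 'html.parser')
    # href=True skips <a> tags that have no href attribute
    return [a['href'] for a in page.find_all('a', href=True)]

url = "https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos"
print('\n'.join(getURL(url)))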
