I've written this script to download images from a subreddit.
# A script to download pictures from reddit.com/r/HistoryPorn
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
import os
import sys #TODO: sys.argv
print('Downloading images...')
# Create a directory for photographs
path_to_hist = '/home/tautvydas/Documents/histphoto'
os.chdir(path_to_hist)
if not os.path.exists('/home/tautvydas/Documents/histphoto'):
os.mkdir(path_to_hist)
website = 'https://www.reddit.com/r/HistoryPorn'
# Go to the internet and connect to the subreddit, start a loop
for i in range(3):
subreddit = urlopen(website)
bs_subreddit = BeautifulSoup(subreddit, 'lxml')
# Create a regex and find all the titles in the page
remove_reddit_tag = re.compile('(\s*\(i.redd.it\)(\s*))')
title_bs_subreddit = bs_subreddit.findAll('p', {'class': 'title'})
# Get text off the page
pic_name = []
for item in title_bs_subreddit[1:]:
item = item.get_text()
item = remove_reddit_tag.sub('', item)
pic_name.append(item)
# Get picture links
pic_bs_subreddit = bs_subreddit.findAll('div', {'data-url' : re.compile('.*')})
pic_img = []
for pic in pic_bs_subreddit[1:]:
pic_img.append(pic['data-url'])
# Zip all info into one
name_link = zip(pic_name, pic_img)
for i in name_link:
urlretrieve(i[1],i[0])
# Click next
for link in bs_subreddit.find('span', {'class' : 'next-button'}).children:
website = link['href']
However I get this FileNotFoundError.
Downloading images...
Traceback (most recent call last):
File "gethist.py", line 44, in <module>
urlretrieve(i[1],i[0])
File "/home/tautvydas/anaconda3/lib/python3.6/urllib/request.py", line 258, in urlretrieve
tfp = open(filename, 'wb')
FileNotFoundError: [Errno 2] No such file or directory: 'Preparation of rocket carrying test instruments, Kauai. June 29, 1962 [2880x1620] https://www.topic.com/a-crimson-fracture-in-the-sky'
What could be the problem? The link in 'data-url' is retrieved fine and works if clicked. Could this be a problem that a name contains a hyperlink? Or the name too long? Because up till that image all other images are downloaded without any issues.
The issue here is related to the names collected : they contain the source of the picture as an url string, and it is misinterpreted like a folder path.
You would need to clean the text to avoid special annoying characters and maybe make them a bit shorter, but i suggest to change the pattern too, to ensure the results, you could parse only the <a> tags that contain the title, not the whole <p> which hold the link too.
Also, instead of building a zip with two different loops, you can create one list of the main blocks by searching the class thing (equivalent to findAll('div', {'data-url' : re.compile('.*')), and then use this list to perform relative queries on each block to find the title and the url.
[...]
remove_reddit_tag = re.compile('(\s*\(i.redd.it\)(\s*))')
name_link = []
for block in bs_subreddit.findAll('div', {'class': 'thing'})[1:]:
item = block.find('a',{'class': 'title'}).get_text()
title = remove_reddit_tag.sub('', item)[:100]
url = block.get('data-url')
name_link.append((title, url))
print(url, title)
for title, url in name_link:
urlretrieve(url, title)
Related
I am working on a project where I need to create a movie database.
I have created my database and imported the links from IMDB that redirect you to the webpage. I would like to add also, the main image/thumbnail of each movie so that I can use then the csv in Power BI.
However, I did not manage to do it:
I have tried this:
import requests
from bs4 import BeautifulSoup
import numpy as np
images = []
for i in df_database_url['Url Film']:
r = requests.get(i)
soup = BeautifulSoup(r.content, "html.parser")
images.append(image_url)
But my goal is to have a column that includes the thumbnail for each movie.
Assuming that i is an imdb movie url (the kind that starts with https://www.imdb.com/title), you can target the script tag that seems to contain a lot of the main information for the movie - you can get that with
# import json
image_url = json.loads(soup.select_one('script[type="application/ld+json"]').text)['image']
or, if we're more cautious:
# import json
scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
if scCont:
try:
scCont = json.loads(scCont[0])
if 'image' not in scCont:
image_url = None
print('No image found for', i)
else: image_url = scCont['image']
except Exception as e:
image_url = None
print('Could not parse movie info for', i, '\n', str(e))
else:
image_url = None
print('Could not find script with movie info for', i)
(and you can get the trailer thumbnail with scCont['trailer']['thumbnailUrl'])
This way, instead of raising an error if anything on the path to the expected info is unavailable, it will just add image_url as None; if you want it to halt and raise error in such cases, use the first version.
and then after the loop you can add in the column with something like
df_database_url['image_urls'] = images
(you probably know that...)
I'm trying to get the download link from the button in this page. But when I open the download link that I get from my code I get this message
I noticed that if I manually click the button and open the link in a new page the csrfKey part of the link is always same whereas when I run the code I get a different key every time. Here's my code
from bs4 import BeautifulSoup
import requests
import re
def GetPage(link):
source_new = requests.get(link).text
soup_new = BeautifulSoup(source_new, 'lxml')
container_new = soup_new.find_all(class_='ipsButton')
for data_new in container_new:
#print(data_new)
headline = data_new # Display text
match = re.findall('download', str(data_new), re.IGNORECASE)
if(match):
print(f'{headline["href"]}\n')
if __name__ == '__main__':
link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
with requests.Session() as connection:
r = connection.get("https://eci.gov.in/")
confirmation_url = BeautifulSoup(
connection.get(link).text, 'lxml'
).select_one(".ipsApp .ipsButton_fullWidth")["href"]
fake_agree_to_continue = connection.get(
confirmation_url.replace("?do=download", "?do=download&confirm=1")
).text
download_links = [
a["href"] for a in
BeautifulSoup(
fake_agree_to_continue, "lxml"
).select(".ipsApp .ipsButton_small")[1:]]
for download_link in download_links:
response = connection.get(download_link)
file_name = (
response
.headers["Content-Disposition"]
.replace('"', "")
.split(" - ")[-1]
)
print(f"Downloading: {file_name}")
with open(file_name, "wb") as f:
f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls. The later one looks like this:
to preface - I am quite new to python and my HTML skills are kindergarten level.
So I am trying to save the quotes from this website which has many links in it for each member of the US Election candidates.
I have managed to get the actual code to extract the quotes (with the help of soem stackoverflow users), but am lost on how to write these quotes in to separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes should be written to MichaelBennet.txt (or something in that form). and so on.. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
file1 = open("Quotefile.txt","w")
# get list of all URLS
for links in tags:
link = links.get('href')
if "java" in link:
print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
#text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
print(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
print(item.get_text())
#Hence, grab that string too
print(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
print(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
#print(text_data)
You can store that text data into the list you had started with text_data. Join all those items and then write to file:
So something like:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt","w")
# get list of all URLS
candidates = []
for links in tags:
link = links.get('href')
if "java" in link:
#print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
candidate = link.split('/')[-1].split('_Free_Trade')[0]
if candidate in candidates:
continue
else:
candidates.append(candidate)
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
#print(item.get_text())
text_data.append(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
#print(item.get_text())
text_data.append(item.get_text())
#Hence, grab that string too
#print(item.next_sibling)
text_data.append(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
#print(item.get_text())
text_data.append(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
candidates.remove(candidate)
continue
text_data = '\n'.join(text_data)
with open("C:/%s.txt" %(candidate), "w") as text_file:
text_file.write(text_data)
print('Aquired: %s' %(candidate))
I am new to Python programming, trying to do web scraping just for learning using Beautifulsoup, applying an iterator using FOR Loop, but I guess it is running only for the one time, and the next time it is showing me some error, tried a lot but was not able to resolve.
Below is my Code -
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.packtpub.com/all'
page = urlopen(url)
soup_packtpage = BeautifulSoup(page,'lxml')
page.close()
all_book = soup_packtpage.find_all("div",class_='book-block-outer')
for book_title in all_book:
title = book_title.div['data-product-title']
price = book_title.div['data-product-price']
category = book_title.div['data-product-category']
print(title)
print("Rs:-"+ price)
print(category)
and below is the output -
Learn Algorithms and Data Structures in Java for Day-to-Day Applications [Video]
Rs:-199.44
Application Development
Traceback (most recent call last):
File "/home/bhagwatanimesh/PycharmProjects/packet_pub/packet_pub", line 17, in
title = book_title.div['data-product-title']
File "/home/bhagwatanimesh/.local/lib/python3.5/site-packages/bs4/element.py", line 1011, in getitem
return self.attrs[key]
KeyError: 'data-product-title'
It seems like, You are trying to access a key which is not present in the dictionary.
For solving this you may use below code.
for book_title in all_book:
try:
title = book_title.div['data-product-title']
price = book_title.div['data-product-price']
category = book_title.div['data-product-category']
print(title)
print("Rs:-"+ price)
print(category)
except:
continue
I have used one of the methods described here Python write to CSV line by line to attempt to write all the lines of my output to a .CSV. I've managed to get it to the stage of outputting and generating the CSV, but instead of showing all the lines of my data I am seeing one line, repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv
headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
url = "xxx?page=%s" % i
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('div', class_='product-block__info')
for item in items:
product = item.find('span', class_='short_desc').text
stock = item.find('span', class_='count_product_stock hidden').text
brand = item.find('h4', class_='brand').text
price = item.find('span', class_='selling_price').text
# create a list of all the fields
sheets = [brand, product, stock, price]
print(sheets)
with open('csvfile.csv','wt') as file:
for l in sheets:
file.writelines(sheets)
file.write('\n')
You probably want something more like the following untested code. The example provided can't be run as is:
from bs4 import BeautifulSoup
import requests
import csv
headers = {'User-Agent': 'Mozilla/5.0'}
# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv','w', encoding='utf-8-sig', newline='') as f:
file = csv.writer(f) # actually use the CSV module.
for i in range(1, 300):
url = "xxx?page=%s" % i
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('div', class_='product-block__info')
for item in items:
product = item.find('span', class_='short_desc').text
stock = item.find('span', class_='count_product_stock hidden').text
brand = item.find('h4', class_='brand').text
price = item.find('span', class_='selling_price').text
# create a list of all the fields
sheets = [brand, product, stock, price]
# write a single line.
file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle it:
#coding:utf8
import csv
with open('csvfile.csv','w', encoding='utf-8-sig', newline='') as f:
file = csv.writer(f)
file.writerow('BRAND PRODUCT STOCK PRICE'.split())
for i in range(1,11):
sheets = ['brand{}'.format(i),'pröduct{}'.format(i),'st,ock{}'.format(i),'price{}'.format(i)]
file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10
In Excel: