Convert Web url to Image - python-3.x

I am trying to take a screenshot of a URL, but it captures the gateway page instead because entry is restricted. So I tried adding an ID and password to open the link, but for some reason it does not work. Could you help?
import getpass
import urllib.parse

import requests

# Screenshot service endpoint; we can modify size, format, zoom as needed.
BASE = 'https://mini.s-shot.ru/1024x0/JPEG/1024/Z100/?'
url = 'https://mail.google.com/mail/'  # or whatever link you need
# Percent-encode the target so it can be appended to BASE as a single value.
url = urllib.parse.quote_plus(url)
print(url)
Id = "XXXXXX"
# Prompt for the password without echoing it to the terminal.
key = getpass.getpass('Password :: ')
path = 'target1.jpg'
# The password is stored in `key`; the original referenced an undefined name
# `Password`, which raised NameError before any request was sent.
# NOTE(review): the credentials are simply appended to the encoded target URL,
# as in the original attempt; whether the screenshot service can use them to
# get past the login gateway is not established here -- confirm.
response = requests.get(BASE + url + Id + key, stream=True)
if response.status_code == 200:
    # Stream the image to disk chunk by chunk.
    with open(path, 'wb') as file:
        for chunk in response:
            file.write(chunk)
Thanks!

Related

Download pdf file using python3

I want to download this website 's pdf file using python3 https://qingarchives.npm.edu.tw/index.php?act=Display/image/207469Zh18QEz#74l
This might accomplish what you're trying to achieve:
import requests

# URL to be downloaded
url = "https://cfm.ehu.es/ricardo/docs/python/Learning_Python.pdf"


def download_pdf(url, file_name):
    """Download *url* and write the response body to *file_name*.

    Args:
        url: Address of the PDF to fetch.
        file_name: Local path the bytes are written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Send GET request
    response = requests.get(url)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    # Save the PDF
    with open(file_name, "wb") as f:
        f.write(response.content)


download_pdf(url, 'myDownloadedFile.pdf')
from urllib import request
# Standard-library alternative: urlretrieve saves the resource under the given
# local file name. Despite the variable name, the value returned is a
# (filename, headers) tuple rather than a response object.
response = request.urlretrieve("https://cfm.ehu.es/ricardo/docs/python/Learning_Python.pdf", "learing_python.pdf")
or
import wget
URL = "https://cfm.ehu.es/ricardo/docs/python/Learning_Python.pdf"
# NOTE(review): the output name starts with a dot, which makes it a hidden file
# on Unix-like systems -- confirm that this is intended.
response = wget.download(URL, ".learing_python.pdf")

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button in this page. But when I open the download link that I get from my code I get this message
I noticed that if I manually click the button and open the link in a new page the csrfKey part of the link is always same whereas when I run the code I get a different key every time. Here's my code
from bs4 import BeautifulSoup
import requests
import re
def GetPage(link):
    """Print the ``href`` of each ``ipsButton`` element on *link* that mentions "download"."""
    page_html = requests.get(link).text
    parsed = BeautifulSoup(page_html, 'lxml')
    for button in parsed.find_all(class_='ipsButton'):
        # Case-insensitive search over the element's complete markup.
        if re.search('download', str(button), re.IGNORECASE):
            print(f'{button["href"]}\n')


if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'

    with requests.Session() as connection:
        # Visit the home page first, inside the same session as the later requests.
        r = connection.get("https://eci.gov.in/")

        # Find the full-width button on the file page and take its target.
        file_page = BeautifulSoup(connection.get(link).text, 'lxml')
        confirmation_url = file_page.select_one(".ipsApp .ipsButton_fullWidth")["href"]

        # Request the same target with confirm=1 added, i.e. "agree" to continue.
        confirmed_url = confirmation_url.replace("?do=download", "?do=download&confirm=1")
        fake_agree_to_continue = connection.get(confirmed_url).text

        # Collect the small buttons on the resulting page, skipping the first one.
        links_page = BeautifulSoup(fake_agree_to_continue, "lxml")
        small_buttons = links_page.select(".ipsApp .ipsButton_small")[1:]
        download_links = [a["href"] for a in small_buttons]

        for download_link in download_links:
            response = connection.get(download_link)
            # Derive the file name from the Content-Disposition header.
            disposition = response.headers["Content-Disposition"]
            file_name = disposition.replace('"', "").split(" - ")[-1]
            print(f"Downloading: {file_name}")
            with open(file_name, "wb") as f:
                f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls. The latter one looks like this:

Downloading/webscraping images from python

I am trying to download all the images from the website but been unable to do so. How I can download all the images from a specific section of a website and save it to my directory?
The below code exports all the image and saves the image link to a csv file, but I also want the image to save it in my directory also.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})
# Close the HTTP response as soon as the body has been read.
with urlopen(req) as page:
    webpage = page.read()
page_soup = soup(webpage, "html.parser")

filename = "abc.csv"
headers = "imagelink\n"

snackcrisps = page_soup.findAll("div", {"class": "divCategories divShops-newegg"})
crispitem = snackcrisps[0]
img = crispitem.findAll("div", {"class": "product_image_div productSmall_image_div_lit"})

# The context manager closes the CSV even if a row fails half-way through.
with open(filename, "w") as f:
    f.write(headers)
    for img1 in img:
        img2 = img1.findAll('img')
        imageLink = img2[0].get('src') if img2 else None
        if not imageLink:
            # No <img> tag or no src attribute in this container: nothing to record.
            continue
        print("imageLink: " + imageLink)
        f.write(imageLink + "\n")
How can I save the images in my local directory? Help needed!!
Many Thanks
I used the response to this post to formulate my answer.
First you need to build the full URL for the image you want. This could be as simple as appending "https:" to the beginning of the image link, or not changing the value at all. You'll have to investigate (review this post) how to adjust the URLs you find based on whether or not they are relative or absolute.
You'll want to use the requests module to make the request for the image.
import os
import shutil
from urllib.parse import urljoin, urlparse

import requests

# Continues the script above: `img` is the list of image containers and
# `my_url` is the address of the page they were scraped from.
for index, img1 in enumerate(img):
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    # Resolve protocol-relative ("//host/a.jpg") and relative links against the
    # page URL; links that are already absolute are returned unchanged.
    imageLink = urljoin(my_url, imageLink)
    r = requests.get(imageLink, stream=True)
    if r.status_code == 200:
        # Give every image its own file name so later downloads do not
        # overwrite earlier ones.
        base_name = os.path.basename(urlparse(imageLink).path)
        file_name = f"{index}_{base_name}" if base_name else f"image_{index}.jpg"
        with open(file_name, 'wb') as f:
            # Have urllib3 undo gzip/deflate transfer encoding before copying.
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

How to scrape all the test match details in cricinfo

I am trying to scrape all the test match details, but it is showing HTTP Error 504: Gateway Timeout. I am getting the details of test matches, but they are not showing. This is my code; I have used bs4 to scrape the test match details from cricinfo.
I need to scrape the details of 2000 test matches this is my code
import os
import time
import unicodedata
import urllib.request as req
from urllib.parse import urljoin

from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
# Create the output directory if it is not there yet.
os.makedirs('./espncricinfo-fc', exist_ok=True)
for i in range(0, 2000):
    # Python 3: urllib2 no longer exists, use urllib.request (imported as req).
    search_page = req.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read()
    soupy = BeautifulSoup(search_page, 'html.parser')
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:
            # Anchor without an href attribute: nothing to follow.
            continue
        # urljoin already yields an absolute URL; prefixing BASE_URL again
        # would duplicate the host.
        odiurl = urljoin(BASE_URL, new_host)
        # Reduce the link to plain ASCII text (str, not bytes) so it can be
        # split and used in a file name.
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore').decode('ascii')
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            with open('espncricinfo-fc/{0!s}'.format(new_host.split("/")[4]), "wb") as f:
                f.write(html)
            print(html)
        else:
            print("no html")
This usually happens when making multiple requests too fast; it can also be that the server is down or that your connection is blocked by the server's firewall. Try increasing your sleep() or adding a random sleep.
import random
.....
for i in range(0, 2000):
soupy = BeautifulSoup(....)
# Wait a random whole number of seconds between 2 and 6 (inclusive) per page.
time.sleep(random.randint(2,6))
not sure why, seems to be working for me.
I made a few changes in the loop through the links. I'm not sure how you're wanting the output to look in terms of writing it to your file, so I left that part alone. But like I said, seems to be working ok on my end.
import os
import time
import urllib.request as req

import bs4
import requests

BASE_URL = 'http://www.espncricinfo.com'
# Create the output directory if it is not there yet.
os.makedirs('C:/espncricinfo-fc', exist_ok=True)
for i in range(0, 2000):
    # `i` must not be reset inside the loop; otherwise page 0 is requested on
    # every iteration and the other result pages are never visited.
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' % i
    search_page = requests.get(url)
    print('Checking page %s of 2000' % (i + 1))
    soupy = bs4.BeautifulSoup(search_page.text, 'html.parser')
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:
            # Anchor without an href attribute: nothing to follow.
            continue
        odiurl = BASE_URL + new_host
        new_host = odiurl
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            # File name: the URL pieces from index 4 onwards, joined with "_".
            with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
                f.write(html)
            #print(html)
        else:
            print("no html")

python requests login with captcha

I'm trying to download a captcha image, solve it manually, then submit it along with a username and password in a POST. My response text is simply the original sign-in page, so I assume that means my code is failing. The webpage I am signing into is on the dark web but I don't know if that is actually relevant. The only thing I can think of is that the act of submitting the POST is generating a new captcha. Hopefully someone with a better understanding of HTTP can help me.
from bs4 import BeautifulSoup
import base64
import requests
# One session for both the GET and the POST, routed through a local SOCKS5 proxy.
session = requests.Session()
session.proxies = {'http': 'socks5h://127.0.0.1:9150', 'https': 'socks5h://127.0.0.1:9150'}
url = session.get("http://waeixxcraed4gw7q.onion/signin")
soup = BeautifulSoup(url.text, "lxml")
imgs = soup.findAll('img')
#save captcha from base64 encoding
# NOTE(review): the [23:] slice presumably drops a "data:image/jpeg;base64,"
# prefix (23 characters) from the src attribute -- confirm against the page.
img_data = bytes(imgs[1]['src'][23:], encoding='utf-8')
with open("olympus_captcha.jpg", "wb") as fh:
    # base64.decodestring was removed in Python 3.9; decodebytes is the
    # replacement with the same behaviour.
    fh.write(base64.decodebytes(img_data))
#solve the captcha that has been saved to the harddrive
captcha = input("enter captcha:\n")
#attempt login (password and username removed)
payload = {"username": username, "password": password, "captcha": captcha}
response = session.post("http://waeixxcraed4gw7q.onion/signin", data=payload)
print(response.text)
I did this in the following way,
First install tesseract on your system
import pytesseract
import requests
from bs4 import BeautifulSoup as bs

try:
    from PIL import Image
except ImportError:
    import Image

# Single location used both for saving and for reading the captcha image.
captcha_path = 'captcha.jpg'

session = requests.session()
url = "Login page link"
r = session.get(url)
soup = bs(r.text, 'html.parser')
img = soup.find('img', id='captcha')
img_SRC = img['src']

# Fetch the image through the same session so that cookies set by the login
# page are sent along with the request.
response = session.get(img_SRC, stream=True)
# Stop on an HTTP error instead of writing an error page to the image file.
response.raise_for_status()
with open(captcha_path, 'wb') as handle:
    for block in response.iter_content(1024):
        if not block:
            break
        handle.write(block)

pic = Image.open(captcha_path)
pytesseract.pytesseract.tesseract_cmd = r'Enter the installation path of the \tesseract.exe'
captchaText = pytesseract.image_to_string(pic)
The text from the captcha can then be used for login.

Resources