How to download an image from the internet using Google Colab (Jupyter) - python-3.x

I need to download an image using a URL. I managed to obtain the URLs of the images I need, but now I'm lost on how to download them to my local computer. I'm using Google Colab / Jupyter. Thank you!
Here's my code so far:
from bs4 import BeautifulSoup
import requests
import json
import urllib.request
#use Globe API to get data
#input userid - plan: have program read userids from csv or excel file
userid = xxxxxxxx
#use Globe API to get data
source = requests.get('https://api.globe.gov/search/v1/measurement/protocol/measureddate/userid/?protocols=land_covers&startdate=2020-05-04&enddate=2020-07-16&userid=' + str(userid) +'&geojson=FALSE&sample=FALSE').text
#set up BeautifulSoup4
soup = BeautifulSoup(source, 'lxml')
#Isolate the Json data and put it into a string called "paragraph"
body = soup.find('body')
paragraph = body.p.text
#load the string into a python object
data = json.loads(paragraph)
#pick out the needed information and store them
for landcover in data['results']:
    siteId = landcover['siteId']
    measuredDate = landcover['measuredDate']
    latitude = landcover['latitude']
    longitude = landcover['longitude']
    protocol = landcover['protocol']
    DownURL = landcover['data']['landcoversDownwardPhotoUrl']
    #Here is where I want to download the url contained in 'DownURL'

Try this:
from google.colab import files as FILE
import os

img_data = requests.get(DownURL).content
with open('image_name.jpg', 'wb') as handler:
    handler.write(img_data)
FILE.download('image_name.jpg')
os.remove('image_name.jpg') # to save up space
If you do not want to hard-code an image name, you can generate a random one, or use a counter variable that is incremented on each loop iteration, as in the sketch below.
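For example, a minimal sketch of that idea inside the original loop (the counter-based file names and the uuid alternative are illustrative, not part of the original code):

import os
import uuid
import requests
from google.colab import files as FILE

count = 0
for landcover in data['results']:
    DownURL = landcover['data']['landcoversDownwardPhotoUrl']
    count += 1
    filename = f'image_{count}.jpg'          # counter-based name, unique per iteration
    # filename = f'{uuid.uuid4().hex}.jpg'   # or a random name instead
    with open(filename, 'wb') as handler:
        handler.write(requests.get(DownURL).content)
    FILE.download(filename)
    os.remove(filename)  # optional clean-up, as in the answer above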

Related

Why not full data?

I am trying to get specific span tags from all 3 URLs, but the final CSV file only contains the data from the last URL.
Python code
from selenium import webdriver
from lxml import etree
from bs4 import BeautifulSoup
import time
import pandas as pd
urls = []
for i in range(1, 4):
    if i == 1:
        url = "https://www.coinbase.com/price/s/listed"
        urls.append(url)
    else:
        url = "https://www.coinbase.com/price/s/listed" + f"?page={i}"
        urls.append(url)
print(urls)
for url in urls:
    wd = webdriver.Chrome()
    wd.get(url)
    time.sleep(30)
    resp = wd.page_source
    html = BeautifulSoup(resp, "lxml")
    tr = html.find_all("tr", class_="AssetTableRowDense__Row-sc-14h1499-1 lfkMjy")
    print(len(tr))
    names = []
    for i in tr:
        name1 = i.find("span", class_="TextElement__Spacer-hxkcw5-0 cicsNy Header__StyledHeader-sc-1xiyexz-0 kwgTEs AssetTableRowDense__StyledHeader-sc-14h1499-14 AssetTableRowDense__StyledHeaderDark-sc-14h1499-17 cWTMKR").text
        name2 = i.find("span", class_="TextElement__Spacer-hxkcw5-0 cicsNy Header__StyledHeader-sc-1xiyexz-0 bjBkPh AssetTableRowDense__StyledHeader-sc-14h1499-14 AssetTableRowDense__StyledHeaderLight-sc-14h1499-15 AssetTableRowDense__TickerText-sc-14h1499-16 cdqGcC").text
        names.append([name1, name2])

ns = pd.DataFrame(names)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
path = "/Users/paul/jpn traffic/coinbase/coinbase"
ns.to_csv(path + date + date + '.csv', index=None)
The results of the two print() calls look fine:
print(urls):['https://www.coinbase.com/price/s/listed', 'https://www.coinbase.com/price/s/listed?page=2', 'https://www.coinbase.com/price/s/listed?page=3']
print(len(tr))
26
30
16
So what's wrong with my code? Why don't I get the full data?
BTW, if I want to run my code on a cloud service every day at a given time, which option works best for a beginner Python learner? I don't need to store huge data in the cloud, I just need Python scripts that send emails to my inbox, that's it.
Why don't you get the full data? Because the data is generated in the back end: the site loads it through an API, which is why it is not available to BeautifulSoup. You can easily get the data using the api_url and requests. To find the api_url, open Chrome DevTools, go to the Network tab, select XHR, and click the request; the Headers tab shows the URL and the Preview tab shows the data.
Now the data is available:
import requests

r = requests.get('https://www.coinbase.com/api/v2/assets/search?base=BDT&country=BD&filter=listed&include_prices=true&limit=30&order=asc&page=2&query=&resolution=day&sort=rank')
coinbase = r.json()['data']
for coin in coinbase:
    print(coin['name'])
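Since the original problem was that only part of the data ended up in the CSV, here is a minimal sketch of paging through the same endpoint and writing everything to one file (the page count of 3 is taken from the question, and the output filename is an assumption):

import requests
import pandas as pd

names = []
for page in range(1, 4):  # assumed: the listing spans 3 pages, as in the question
    url = ('https://www.coinbase.com/api/v2/assets/search'
           '?base=BDT&country=BD&filter=listed&include_prices=true'
           f'&limit=30&order=asc&page={page}&query=&resolution=day&sort=rank')
    r = requests.get(url)
    # keep only the coin names, the one field the answer above relies on
    names.extend([coin['name']] for coin in r.json()['data'])

pd.DataFrame(names, columns=['name']).to_csv('coinbase.csv', index=None)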

Problem exporting Web Url results into CSV using beautifulsoup3

Problem: I tried to export the results (Name, Address, Phone) into CSV, but the CSV code is not returning the expected results.
#Import the installed modules
import requests
from bs4 import BeautifulSoup
import json
import re
import csv
#To get the data from the web page we will use requests get() method
url = "https://www.lookup.pk/dynamic/search.aspx?searchtype=kl&k=gym&l=lahore"
page = requests.get(url)
# To check the http response status code
print(page.status_code)
#Now I have collected the data from the web page, let's see what we got
print(page.text)
#The above data can be viewed in a pretty format by using beautifulsoup's prettify() method. For this we will create a bs4 object and use the prettify method
soup = BeautifulSoup(page.text, 'lxml')
print(soup.prettify())
#Find all DIVs that contain Companies information
product_name_list = soup.findAll("div",{"class":"CompanyInfo"})
#Find all Companies Name under h2tag
company_name_list_heading = soup.findAll("h2")
#Find all Address on page Name under a tag
company_name_list_items = soup.findAll("a",{"class":"address"})
#Find all Phone numbers on page Name under ul
company_name_list_numbers = soup.findAll("ul",{"class":"submenu"})
Created a for loop to print out all company data:
for company_address in company_name_list_items:
    print(company_address.prettify())

# Create for loop to print out all company Names
for company_name in company_name_list_heading:
    print(company_name.prettify())

# Create for loop to print out all company Numbers
for company_numbers in company_name_list_numbers:
    print(company_numbers.prettify())
Below is the code to export the results (name, address & phone number) into CSV:
outfile = open('gymlookup.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["name", "Address", "Phone"])
product_name_list = soup.findAll("div",{"class":"CompanyInfo"})
company_name_list_heading = soup.findAll("h2")
company_name_list_items = soup.findAll("a",{"class":"address"})
company_name_list_numbers = soup.findAll("ul",{"class":"submenu"})
Here is the for loop to loop over data.
for company_name in company_name_list_heading:
    names = company_name.contents[0]
for company_numbers in company_name_list_numbers:
    names = company_numbers.contents[1]
for company_address in company_name_list_items:
    address = company_address.contents[1]
writer.writerow([name, Address, Phone])
outfile.close()
You need to work on understanding how for loops work, and also the difference between strings, variables, and other datatypes. You also need to work on applying what you have seen in other Stack Overflow questions. This is essentially the same as your other 2 questions you already posted, just with a different site you're scraping (I didn't flag it as a duplicate, since you're new to Stack Overflow and web scraping, and I remember what it was like to learn). I'll still answer your questions, but eventually you need to be able to find the answers on your own and learn how to adapt and apply them; coding isn't painting by numbers. I do see you are adapting some of it already, and good job finding the "div",{"class":"CompanyInfo"} tag to get the company info.
The data you are pulling (name, address, phone) needs to be collected within a nested loop over the div class="CompanyInfo" elements. You could theoretically keep your current structure by putting the values into lists and then writing to the CSV from those lists, but there is a risk of missing data, and then your rows could be misaligned or matched to the wrong company.
Here's what the full code looks like. Notice that the variables are stored within the loop and then written; it then moves on to the next CompanyInfo block and continues.
#Import the installed modules
import requests
from bs4 import BeautifulSoup
import csv
#To get the data from the web page we will use requests get() method
url = "https://www.lookup.pk/dynamic/search.aspx?searchtype=kl&k=gym&l=lahore"
page = requests.get(url)
# To check the http response status code
print(page.status_code)
#Now I have collected the data from the web page, let's see what we got
print(page.text)
#The above data can be viewed in a pretty format by using beautifulsoup's prettify() method. For this we will create a bs4 object and use the prettify method
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())
outfile = open('gymlookup.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])
#Find all DIVs that contain Companies information
product_name_list = soup.findAll("div",{"class":"CompanyInfo"})
# Now loop through those elements
for element in product_name_list:
    # Takes 1 block of the "div",{"class":"CompanyInfo"} tag and finds/stores name, address, phone
    name = element.find('h2').text
    address = element.find('address').text.strip()
    phone = element.find("ul",{"class":"submenu"}).text.strip()
    # writes the name, address, phone to csv
    writer.writerow([name, address, phone])
    # now will go to the next "div",{"class":"CompanyInfo"} tag and repeats

outfile.close()

How to download bulk amount of images from google or any website

Actually, I need to do a project on machine learning, and for that I want a lot of images for training. I searched for a solution to this problem but failed to find one.
Can anyone help me solve this? Thanks in advance.
I used Google Images to download images using Selenium. It is just a basic approach.
from selenium import webdriver
import time
import urllib.request
import os
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome("path\\to\\the\\webdriverFile")
browser.get("https://www.google.com")
search = browser.find_element_by_name('q')
search.send_keys(key_words, Keys.ENTER) # use required key_words to download images
elem = browser.find_element_by_link_text('Images')
elem.get_attribute('href')
elem.click()

value = 0
for i in range(20):
    browser.execute_script("scrollBy(" + str(value) + ",+1000);")
    value += 1000
    time.sleep(3)

elem1 = browser.find_element_by_id('islmp')
sub = elem1.find_elements_by_tag_name("img")

try:
    os.mkdir('downloads')
except FileExistsError:
    pass

count = 0
for i in sub:
    src = i.get_attribute('src')
    try:
        if src != None:
            src = str(src)
            print(src)
            count += 1
            urllib.request.urlretrieve(src,
                os.path.join('downloads', 'image' + str(count) + '.jpg'))
        else:
            raise TypeError
    except TypeError:
        print('fail')
    if count == required_images_number: ## use number as required
        break
Check this for a detailed explanation.
Download the driver here.
My tip to you: use an image search API. My favourite is the Bing Image Search API.
The following text is from Send search queries using the REST API and Python.
Running the quickstart
To get started, set subscription_key to a valid subscription key for the Bing API service.
Python
subscription_key = None
assert subscription_key
Next, verify that the search_url endpoint is correct. At this writing, only one endpoint is used for Bing search APIs. If you encounter authorization errors, double-check this value against the Bing search endpoint in your Azure dashboard.
Python
search_url = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
Set search_term to look for images of puppies.
Python
search_term = "puppies"
The following block uses the requests library in Python to call out to the Bing search APIs and return the results as a JSON object. Observe that we pass in the API key via the headers dictionary and the search term via the params dictionary. To see the full list of options that can be used to filter search results, refer to the REST API documentation.
Python
import requests
headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
params = {"q": search_term, "license": "public", "imageType": "photo"}
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
search_results = response.json()
The search_results object contains the actual images along with rich metadata such as related items. For example, the following line of code can extract the thumbnail URLs for the first 16 results.
Python
thumbnail_urls = [img["thumbnailUrl"] for img in search_results["value"][:16]]
Then use the PIL library to download the thumbnail images and the matplotlib library to render them on a 4 x 4 grid.
Python
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
f, axes = plt.subplots(4, 4)
for i in range(4):
    for j in range(4):
        image_data = requests.get(thumbnail_urls[i+4*j])
        image_data.raise_for_status()
        image = Image.open(BytesIO(image_data.content))
        axes[i][j].imshow(image)
        axes[i][j].axis("off")
plt.show()
Sample JSON response
Responses from the Bing Image Search API are returned as JSON. This sample response has been truncated to show a single result.
JSON
{
  "_type":"Images",
  "instrumentation":{
    "_type":"ResponseInstrumentation"
  },
  "readLink":"images\/search?q=tropical ocean",
  "webSearchUrl":"https:\/\/www.bing.com\/images\/search?q=tropical ocean&FORM=OIIARP",
  "totalEstimatedMatches":842,
  "nextOffset":47,
  "value":[
    {
      "webSearchUrl":"https:\/\/www.bing.com\/images\/search?view=detailv2&FORM=OIIRPO&q=tropical+ocean&id=8607ACDACB243BDEA7E1EF78127DA931E680E3A5&simid=608027248313960152",
      "name":"My Life in the Ocean | The greatest WordPress.com site in ...",
      "thumbnailUrl":"https:\/\/tse3.mm.bing.net\/th?id=OIP.fmwSKKmKpmZtJiBDps1kLAHaEo&pid=Api",
      "datePublished":"2017-11-03T08:51:00.0000000Z",
      "contentUrl":"https:\/\/mylifeintheocean.files.wordpress.com\/2012\/11\/tropical-ocean-wallpaper-1920x12003.jpg",
      "hostPageUrl":"https:\/\/mylifeintheocean.wordpress.com\/",
      "contentSize":"897388 B",
      "encodingFormat":"jpeg",
      "hostPageDisplayUrl":"https:\/\/mylifeintheocean.wordpress.com",
      "width":1920,
      "height":1200,
      "thumbnail":{
        "width":474,
        "height":296
      },
      "imageInsightsToken":"ccid_fmwSKKmK*mid_8607ACDACB243BDEA7E1EF78127DA931E680E3A5*simid_608027248313960152*thid_OIP.fmwSKKmKpmZtJiBDps1kLAHaEo",
      "insightsMetadata":{
        "recipeSourcesCount":0,
        "bestRepresentativeQuery":{
          "text":"Tropical Beaches Desktop Wallpaper",
          "displayText":"Tropical Beaches Desktop Wallpaper",
          "webSearchUrl":"https:\/\/www.bing.com\/images\/search?q=Tropical+Beaches+Desktop+Wallpaper&id=8607ACDACB243BDEA7E1EF78127DA931E680E3A5&FORM=IDBQDM"
        },
        "pagesIncludingCount":115,
        "availableSizesCount":44
      },
      "imageId":"8607ACDACB243BDEA7E1EF78127DA931E680E3A5",
      "accentColor":"0050B2"
    }
  ]
}
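Since the goal here is bulk-downloading training images rather than rendering thumbnails, a minimal sketch that saves the full-resolution results via their contentUrl could look like the following. It reuses subscription_key and search_url from the quickstart; the count parameter, query, and output folder are assumptions, not part of the quoted quickstart:

import os
import requests

os.makedirs('downloads', exist_ok=True)
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
params = {"q": "puppies", "license": "public", "imageType": "photo", "count": 100}

response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()

for i, img in enumerate(response.json()["value"]):
    try:
        # fetch the full-size image, not the thumbnail
        data = requests.get(img["contentUrl"], timeout=10)
        data.raise_for_status()
        with open(os.path.join('downloads', f'image{i}.jpg'), 'wb') as f:
            f.write(data.content)
    except requests.RequestException:
        print('failed:', img["contentUrl"])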

Opening Image from Website

I was trying to make a simple program to pull an image from the website xkcd.com, and I seem to be running into a problem where it returns 'list' object has no attribute 'show'. Does anyone know how to fix this?
import requests
from lxml import html
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
tree = html.fromstring(r.content)
final = tree.xpath("""//*[@id="comic"]/img""")
final.show()
Your call to requests.get is retrieving the actual image, the raw bytes of the PNG. There is no HTML to parse or search with XPath.
Note here, the content is bytes:
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
print(r.content)
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xe4\x00\x00\x01#\x08\x03\x00\x00\x00M\x7f\xe4\xc6\x00\x00\x00\x04gAMA\x00\x00\xb1\x8f
Here you see that you can save the results directly to disk.
import requests
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
with open("myimage.png", "wb") as f:
f.write(r.content)
[Edit] And to show the image (you will need to install Pillow):
import requests
from PIL import Image
import io
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
img = Image.open(io.BytesIO(r.content))
img.show()
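If the intent was to start from the xkcd page itself rather than the direct image URL, a minimal sketch would first fetch the HTML, extract the img src with XPath, and then download that. It assumes the comic image still sits under the element with id="comic" and that xkcd serves protocol-relative image URLs:

import requests
from lxml import html
from io import BytesIO
from PIL import Image

page = requests.get("https://xkcd.com/")            # the HTML page, not the image
tree = html.fromstring(page.content)
srcs = tree.xpath('//*[@id="comic"]/img/@src')       # list of matching src attribute strings
img_url = "https:" + srcs[0]                         # assumed protocol-relative URL like //imgs.xkcd.com/...

r = requests.get(img_url)
Image.open(BytesIO(r.content)).show()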

Unable to download a csv file from the URL with python script

I am accessing a URL with a username and password to download a CSV, and I want to save the file with today's date and time; there is only one download link on the page.
Is there any way I can achieve this task via Python?
I am using the script below and I can see the print output, but how can I download the file behind the "download csv" button on the web page? Normally when I click the download csv button it asks me to save the file.
import re
import requests
from bs4 import BeautifulSoup
url = 'https://url.com'
login_data = dict(login='user@example.com', password='password-g')
session = requests.session()
link = 'https://url.com'
r = requests.get(link)
soup = BeautifulSoup(r.text, "html.parser")
for i in soup.find_all('a', {'class': "app-btn-down"}):
    print(re.search('http://.*\b_file', i.get('href')).group(0)) # the CSV file name is b_file
print("r.text")
As I'm new to Python, please forgive my bad explanation.
This is mostly pseudocode since I don't know the HTML data, but I think you'll get the idea.
First you have to submit your data to get the necessary cookies in your session (you can check the cookies with s.cookies). Keep in mind that there may be more fields you have to submit besides login and password. Use this session for all your requests.
Then you can get the CSV link with bs4, assuming it's not generated by JS; otherwise you may have to use Selenium.
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime
import os
s = requests.session()
url = 'https://url.com'
login_data = dict(login='user@example.com', password='password-g')
s.post(url, data=login_data)
link = 'https://url.com'
r = s.get(link)
soup = BeautifulSoup(r.text, "html.parser")
csv_link = soup.find('a', {'class':'app-btn-down', 'href':lambda h:'b_file' in h})['href']
csv_file = s.get(csv_link).text
Finally you can get the date and time with gmtime, and use strftime to format it.
date_time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
path = os.path.join('/some/dir', date_time)
with open(path, 'w') as f:
    f.write(csv_file)
