Unable to download a csv file from the URL with python script - python-3.x

I am accessing a URL with a username and password to download a CSV and save the file with today's date and time; there is only one download link on the page.
Is there any way I can achieve this task via Python?
I am using the script below and I see the print output, but how can I trigger the download behind the "download csv" button on the web page? Normally when I click the download csv button it asks me to save the file.
import re
import requests
from bs4 import BeautifulSoup

url = 'https://url.com'
login_data = dict(login='user#example.com', password='password-g')
session = requests.session()
link = 'https://url.com'
r = requests.get(link)
soup = BeautifulSoup(r.text, "html.parser")
for i in soup.find_all('a', {'class': "app-btn-down"}):
    print(re.search(r'http://.*\b_file', i.get('href')).group(0))  # the CSV file name is b_file
print(r.text)
As I'm new to Python, please forgive my bad explanation.

This is mostly pseudocode since I don't know the HTML, but I think you'll get the idea.
First you have to submit your data to get the necessary cookies in your session (you can check the cookies with s.cookies). Keep in mind that there may be more fields you have to submit besides login and password. Use this session for all your requests.
Then you can get the CSV link with bs4, assuming it's not generated by JS; otherwise you may have to use selenium.
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime
import os
s = requests.session()
url = 'https://url.com'
login_data = dict(login='user#example.com', password='password-g')
s.post(url, data=login_data)
link = 'https://url.com'
r = s.get(link)
soup = BeautifulSoup(r.text, "html.parser")
csv_link = soup.find('a', {'class': 'app-btn-down', 'href': lambda h: h and 'b_file' in h})['href']
csv_file = s.get(csv_link).text
Finally you can get the date and time with gmtime, and use strftime to format it.
date_time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
path = os.path.join('/some/dir', date_time)
with open(path, 'w') as f:
    f.write(csv_file)
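If the CSV turns out to be large, it can be safer to stream the response and write raw bytes rather than going through .text; a minimal sketch of that last step, reusing the session s, csv_link and path from the snippets above:
# stream the download and write bytes to avoid loading everything into memory
with s.get(csv_link, stream=True) as resp:
    resp.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)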

Related

Use beautifulsoup to download href links

Looking to download href links using beautifulsoup4, Python 3 and the requests library.
This is the code that I have now. I thought it would be tough to use regex in this situation, but I'm not sure if this can be done with BeautifulSoup alone. I have to download all of the shape files from the grid and am looking to automate this task. Thank you!
URL:
https://earth-info.nga.mil/index.php?dir=coordsys&action=gars-20x20-dloads
import requests
from bs4 import BeautifulSoup
import re
URL = 'https://earth-info.nga.mil/index.php?dir=coordsys&action=gars-20x20-dloads'
page = requests.get(URL)
soup = BeautifulSoup(page.content,'html.parser')
results = re.findall(r'<a[^>]* href="([^"]*)"', page)
print(results)
Those files are all associated with an area tag, so I would simply select those:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://earth-info.nga.mil/index.php?dir=coordsys&action=gars-20x20-dloads')
soup = bs(r.content, 'lxml')
files = ['https://earth-info.nga.mil/' + i['href'] for i in soup.select('area')]
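If the goal is to actually save the shapefile archives rather than just collect the URLs, a short follow-up sketch can iterate over files from the snippet above (the output directory name is hypothetical, and deriving the file name from the href is an assumption about the link format):
import os
import requests

out_dir = 'gars_shapefiles'  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
for url in files:
    name = url.rsplit('/', 1)[-1] or 'gars_download.zip'  # fall back if the href ends with a slash
    resp = requests.get(url)
    resp.raise_for_status()
    with open(os.path.join(out_dir, name), 'wb') as f:
        f.write(resp.content)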
You can use the response text (page.text) in order to search for all the a tags with regex.
Instead of:
results = re.findall(r'<a[^>]* href="([^"]*)"', page)
Use:
results = re.findall(r'<a[^>]* href="([^"]*)"', page.text)
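Alternatively, since the page is already parsed into soup in the question's code, the hrefs can be collected without regex at all; a short sketch reusing that soup object:
results = [a['href'] for a in soup.find_all('a', href=True)]  # every anchor that actually has an href
print(results)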

How can I ensure that relative links are saved as absolute URLs in the output file?

I need to develop a web links scraper program in Python that extracts all of the unique web links that point to other web pages from the HTML code of the "Current Estimates" page on the "US Census Bureau" website (see web link below), both inside and outside that domain, and that writes them to a comma-separated values (CSV) file as absolute uniform resource identifiers (URIs).
I use the code below in a Jupyter Notebook, and it seems to generate a CSV, but part of my code is producing a double http:https:// on links that are already absolute, when it should only be adding the prefix to the relative links:
http:https://www.census.gov/data/training-workshops.html
http:https://www.census.gov/programs-surveys/sis.html
I need better code to change the relative links to absolute. I believe full_url = urljoin(url, link.get("href")) should be doing this, but something is incorrect.
How can I ensure that relative links are saved as absolute URLs in the output file?
import requests
from bs4 import BeautifulSoup, SoupStrainer
import csv
from urllib.parse import urljoin
import re

url = 'https://www.census.gov/programs-surveys/popest.html'
r = requests.get(url)
raw_html = r.text
print(r.text)
soup = BeautifulSoup(raw_html, 'html.parser')
print(soup.prettify())

for link in soup.find_all('a', href=True):
    full_url = urljoin(url, link.get("href"))
    print(link.get('href'))

links_set = set()
for link in soup.find_all(href=re.compile('a')):
    print(link.get('href'))

for item in soup.find_all('a', href=re.compile(r'html')):
    links_set.add(item.get('href'))

links = [x[:1]=='http' and x or 'http:'+x for x in links_set]

with open("C996FinalAssignment.csv", "w") as csv_file:
    writer = csv.writer(csv_file, delimiter="\n")
    writer.writerow(links)
Try this.
import requests
import csv
from simplified_scrapy.simplified_doc import SimplifiedDoc
url = 'https://www.census.gov/programs-surveys/popest.html'
r = requests.get(url)
raw_html = r.text
print(r.text)
doc = SimplifiedDoc(raw_html)
lstA = doc.listA(url=url) # It will help you turn relative links into absolute links
links = [a.url for a in lstA]
with open("C996FinalAssignment.csv", "w") as csv_file:
writer = csv.writer(csv_file,delimiter="\n")
writer.writerow(links)
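For completeness, the same result can be had with the requests/BeautifulSoup/urljoin combination the question already uses; a minimal sketch (output file name kept from the question):
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.census.gov/programs-surveys/popest.html'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# urljoin leaves absolute hrefs untouched and resolves relative ones against the page URL
links = {urljoin(url, a['href']) for a in soup.find_all('a', href=True)}

with open("C996FinalAssignment.csv", "w", newline='') as csv_file:
    writer = csv.writer(csv_file)
    for link in sorted(links):
        writer.writerow([link])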

Beautifulsoup response does not match with view source code output

While comparing the response from my code with the Chrome page source, I observe that the response returned to BeautifulSoup does not match the page source. I want to fetch class="rc", and I can see the class "rc" in the page source, but I could not find it in the printed response. I checked with "lxml" and "html.parser" too.
I am a beginner in Python, so my question might sound basic. Also, I already checked a few articles related to my problem (BeautifulSoup returning different html than view source) but could not find a solution.
Below is my code:
import sys, requests
import re
import docx
import webbrowser
from bs4 import BeautifulSoup

query = sys.argv
url = "https://google.com/search?q=" + "+".join(query[1:])
print(url)
res = requests.get(url)
# print(res[:1000])
if res.status_code == 200:
    soup = BeautifulSoup(res.text, "html5lib")
    print(type(soup))
    all_select = soup.select("div", {"class": "rc"})
    print("All Select ", all_select)
I had the same problem; try using another parser such as "lxml" instead of "html5lib".
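For reference, a minimal variant of the question's last lines with the parser swapped, and with the selector written as a CSS string, since soup.select expects a selector rather than the find_all-style attrs dict (this reuses res from the snippet above):
soup = BeautifulSoup(res.text, "lxml")
all_select = soup.select("div.rc")  # CSS selector for <div class="rc">
print("All Select ", all_select)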

How can I scrape data that is not in the page source code?

scrape.py
# code to scrape the links from the html
from bs4 import BeautifulSoup
import urllib.request

data = open('scrapeFile', 'r')
html = data.read()
data.close()
soup = BeautifulSoup(html, features="html.parser")

# code to extract links
links = []
for div in soup.find_all('div', {'class': 'main-bar z-depth-1'}):
    # print(div.a.get('href'))
    links.append('https://godamwale.com' + str(div.a.get('href')))
print(links)

file = open("links.txt", "w")
for link in links:
    file.write(link + '\n')
    print(link)
file.close()
I have successfully got the list of links by using this code. But when I want to scrape the data from those links, their HTML pages don't contain the source code that holds the data, which makes extracting it tough. I have used the selenium driver, but it doesn't work well for me.
I want to scrape the data from the link below, which contains data in HTML sections for customer details, licence and automation, commercial details, floor-wise details, and operational details. I want to extract these data along with name, location, contact number and type.
https://godamwale.com/list/result/591359c0d6b269eecc1d8933
The link is here. If someone finds a solution, please share it.
Using Developer tools in your browser, you'll notice that whenever you visit that link there is a request to https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933 that returns a JSON response, probably containing the data you're looking for.
Python 2.x:
import urllib2, json
contents = json.loads(urllib2.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read())
print contents
Python 3.x:
import urllib.request, json
contents = json.loads(urllib.request.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read().decode('UTF-8'))
print(contents)
Here you go. The main problem with the site seems to be that it takes time to load; that's why it was returning an incomplete page source. You have to wait until the page loads completely. Notice the time.sleep(8) line in the code below:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time

CHROMEDRIVER_PATH = r"C:\Users\XYZ\Downloads/Chromedriver.exe"  # raw string so \U is not treated as an escape
wd = webdriver.Chrome(CHROMEDRIVER_PATH)
wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")
time.sleep(8)  # wait until the page loads completely
soup = BeautifulSoup(wd.page_source, 'lxml')

props_list = []
propvalues_list = []
div = soup.find_all('div', {'class': 'row'})
for childtags in div[6].findChildren('div', {'class': 'col s12 m4 info-col'}):
    props = childtags.find("span").contents
    props_list.append(props)
    propvalue = childtags.find("p", recursive=True).contents
    propvalues_list.append(propvalue)
print(props_list)
print(propvalues_list)
Note: the code will return the construction details in 2 separate lists.
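A fixed time.sleep works, but it either wastes time or still races on slow connections; an explicit wait on a known element is usually more robust. A small sketch, reusing the wd driver and BeautifulSoup import from the snippet above and assuming the div.row elements used there are what we need to wait for:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")
# wait up to 20 seconds for at least one div.row to be present instead of sleeping a fixed 8 seconds
WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.row")))
soup = BeautifulSoup(wd.page_source, 'lxml')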

Scraping the stackoverflow user data

import requests
from bs4 import BeautifulSoup
import csv

response = requests.get('https://stackoverflow.com/users?page=3&tab=reputation&filter=week').text
soup = BeautifulSoup(response, 'lxml')
for items in soup.select('.user-details'):
    name = items.select("a")[0].text
    location = items.select(".user-location")[0].text
    reputation = items.select(".reputation-score")[0].text
    print(name, location, reputation)
    with open('stackdata.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([name, location, reputation])
When we change the URL in this code, the output remains the same.
I came across a similar problem. The solution that worked for me is using selenium. Though I used a headless browser, i.e. PhantomJS, I assume it should work for other browsers too.
from selenium import webdriver

driver = webdriver.PhantomJS('/home/practice/selenium/webdriver/phantomjs/bin/phantomjs')
users = []
page_num = 1
driver.get('https://stackoverflow.com/users?page={page_num}&tab=reputation&filter=week'.format(page_num=page_num))
content = driver.find_element_by_id('content')
for details in content.find_elements_by_class_name('user-details'):
    users.append(details.text)
print(users)
Change the page_num to get the desired result.
Hope this will help!
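If you need several pages at once, the same idea extends to a loop over page_num; a short sketch reusing the driver set up above (the page range is just an example):
users = []
for page_num in range(1, 4):  # first three pages; adjust the range as needed
    driver.get('https://stackoverflow.com/users?page={page_num}&tab=reputation&filter=week'.format(page_num=page_num))
    content = driver.find_element_by_id('content')
    for details in content.find_elements_by_class_name('user-details'):
        users.append(details.text)
print(users)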
