I have been working on a requirement where I need to download a file from a website that outputs data in CSV format and then write it to a SQL table. I am using the logic below to download the data:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

# Cookie-aware opener so the login page and the report request share a session
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor())

url = "https://abcdef.oz/login"
response = opener.open(url)
if response.status == 200:
    # webUrl = urllib.request.urlopen(url)
    # print("Result code: " + str(webUrl.getcode()))
    data = response.read().decode("utf-8")
    soup = BeautifulSoup(data, "html.parser")
    # Got a token
    token = soup.find_all("meta", {"name": "token"}, limit=1)[0]["content"]
    print("token", token)
    sdata = {"email": "abc#def.com", "password": "password"}
    data = urllib.parse.urlencode(sdata).encode()
    print("data", data)
    # (the encoded credentials would normally be POSTed back to the login form here)
    url = "https://abcdef.oz.co/reports"
    response = opener.open(url)
    r = response.read().decode("utf-8")
    print(r)
else:
    print("No Response")
How can the response now be converted to a format where I can skip the header and write the data to a SQL table?
The output of the response looks like this:
"Col1","Col2","Col3"\n"abc","def","efg"\n"hij","klm","mno"
Thanks in advance
This is not mind-blowing, but did you try:
import pandas as pd
# read_html returns a list of DataFrames, one per HTML table on the page
df = pd.read_html(url)
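If the response body is the raw CSV text shown above rather than an HTML table, the standard csv module can parse it directly; a minimal sketch, assuming a local SQLite database and placeholder table/column names:

import csv
import io
import sqlite3

# r is the decoded CSV text from the /reports response above
reader = csv.reader(io.StringIO(r))
next(reader)  # skip the "Col1","Col2","Col3" header row

conn = sqlite3.connect("reports.db")
conn.execute("CREATE TABLE IF NOT EXISTS report (col1 TEXT, col2 TEXT, col3 TEXT)")
conn.executemany("INSERT INTO report VALUES (?, ?, ?)", reader)
conn.commit()
conn.close()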
I got the report ID, and using the report ID I could get an S3 downloadable link. But when I try to use the link it shows "Access Unauthorised".
You need to do a GET request against the download link. The URL requires authentication as well, so the authorization header must be passed too.
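For reference, a minimal sketch of such a request (the header names are the same ones used in the Amazon Ads API snippet further down; the variables are placeholders for your own credentials and the returned S3 link):

import requests

headers = {
    "Amazon-Advertising-API-ClientId": client_id,
    "Amazon-Advertising-API-Scope": profile_id,
    "Authorization": f"Bearer {access_token}",
}
# Plain GET against the S3 download link returned by the report status call
resp = requests.get(download_url, headers=headers)
resp.raise_for_status()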
Edit: I am adding a function I created that downloads the gzipped file and extracts the JSON in it, in case it helps anyone else:
import requests
import gzip
import json
import io

def report_download(url, headers):
    # url is the S3 download link; headers are the same auth headers used for
    # the rest of the API (passed in here since the original snippet assumed
    # they were already defined elsewhere)
    req = requests.get(url, headers=headers)
    response = req.content
    zip_file = io.BytesIO(response)
    with gzip.open(zip_file, 'rb') as f:
        file_content = f.read()
    json_data = json.loads(file_content)
    with open("filename.json", "w") as outfile:
        json.dump(json_data, outfile)
Here's an easy way to download the report with Python and pandas:
import io
import os
import requests
import pandas as pd

def api_download_report_resp(access_token, profile_id, report_id):
    url = f"https://advertising-api.amazon.com/v2/reports/{report_id}/download"
    client_id = os.getenv("AMAZON_ADS_API_CLIENT_ID")
    with requests.Session() as sess:
        sess.headers["Amazon-Advertising-API-ClientId"] = client_id
        sess.headers["Amazon-Advertising-API-Scope"] = profile_id
        sess.headers["Authorization"] = f"Bearer {access_token}"
        sess.headers["Content-Type"] = "application/json"
        resp = sess.get(url)
    return resp

# access_token, profile_id and report_id come from the earlier auth / report-request steps
resp = api_download_report_resp(access_token, profile_id, report_id)
print(resp)
# <Response [200]>

# The report body is gzipped JSON; pandas can read it straight from a buffer
fobj = io.BytesIO(resp.content)
df = pd.read_json(fobj, compression='gzip')
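From there the DataFrame can be written out however you need; for example (the file name is just a placeholder):

df.to_csv("report.csv", index=False)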
I am able to get the data from a PDF into text, but now I need the data in CSV format with the table structure preserved. I tried to get the table structure but it didn't work. I'm also able to generate the output as JSON. Is there a way to get the result into a table/CSV format? Any inputs?
Below is the code I have used.
import boto3
import time
# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"
def startJob(s3BucketName, objectName):
response = None
client = boto3.client('textract')
response = client.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': s3BucketName,
'Name': objectName
}
})
return response["JobId"]
def isJobComplete(jobId):
# For production use cases, use SNS based notification
# Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
time.sleep(5)
client = boto3.client('textract')
response = client.get_document_text_detection(JobId=jobId)
status = response["JobStatus"]
#print("Job status: {}".format(status))
while(status == "IN_PROGRESS"):
time.sleep(5)
response = client.get_document_text_detection(JobId=jobId)
status = response["JobStatus"]
#print("Job status: {}".format(status))
return status
def getJobResults(jobId):
pages = []
client = boto3.client('textract')
response = client.get_document_text_detection(JobId=jobId)
pages.append(response)
print("Resultset page recieved: {}".format(len(pages)))
nextToken = None
if('NextToken' in response):
nextToken = response['NextToken']
while(nextToken):
response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
pages.append(response)
#print("Resultset page recieved: {}".format(len(pages)))
nextToken = None
if('NextToken' in response):
nextToken = response['NextToken']
return pages
def lambda_handler(event, context):
jobId = startJob(s3BucketName, documentName)
#print("Started job with id: {}".format(jobId))
if(isJobComplete(jobId)):
response = getJobResults(jobId)
# Print detected text
for resultPage in response:
for item in resultPage["Blocks"]:
if item["BlockType"] == "LINE":
print (item["Text"]) ```
You can import the csv module to write to a CSV file like so:
import csv
with open('my_pdf.txt', 'r') as in_file:
stripped = (line.strip() for line in in_file)
lines = (line.split(",") for line in stripped if line)
with open('my_pdf.csv', 'w', newline='') as out_file:  # newline='' avoids extra blank rows on Windows
writer = csv.writer(out_file)
writer.writerow(('title', 'intro'))
writer.writerows(lines)
You can just put in the rows you need, and this splits your data into comma-separated values. You can find more information on csv.writer (and the csv module in general) in the Python docs.
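If the goal is true row/column structure rather than plain text lines, Textract also has an asynchronous document-analysis API with a TABLES feature type; a minimal sketch of starting such a job, reusing the same bucket and document as above:

import boto3

def startTableAnalysis(s3BucketName, objectName):
    # Same async pattern as startJob above, but document analysis with the
    # TABLES feature returns TABLE and CELL blocks instead of plain LINE blocks
    client = boto3.client('textract')
    response = client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        },
        FeatureTypes=['TABLES'])
    return response["JobId"]

# Results are paged with get_document_analysis(JobId=...); each CELL block
# carries RowIndex and ColumnIndex, which is enough to rebuild the rows and
# hand them to csv.writer as in the snippet above.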
I scraped Twitter for the user name, tweets, replies, and retweets, but I can't save them in a CSV file.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)
for page in range(0,5):
url = "https://twitter.com/BBCWorld".format(page)
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")
tweets = soup.find_all("div", {"class":"js-stream-item"})
for tweet in tweets:
try:
if tweet.find('p',{"class":'tweet-text'}):
tweet_user = tweet.find('span',{"class":'username'}).text.strip()
tweet_text = tweet.find('p',{"class":'tweet-text'}).text.encode('utf8').strip()
replies = tweet.find('span',{"class":"ProfileTweet-actionCount"}).text.strip()
retweets = tweet.find('span', {"class" : "ProfileTweet-action--retweet"}).text.strip()
print(tweet_user, tweet_text, replies, retweets)
f.write("{}".format(tweet_user).replace(",","|")+ ",{}".format(tweet_text)+ ",{}".format( replies).replace(",", " ")+ ",{}".format(retweets) + "\n")
except: AttributeError
f.close()
I get the data but can't save it in a CSV file. Can someone explain how to save the data to a CSV file?
As you can see, you've only made a small error in finding the tweets here: tweets = soup.find_all("div", {"class":"js-stream-item"}). You forgot to pass the argument by keyword, so it should look like this: tweets = soup.find_all("div", attrs={"class":"js-stream-item"}).
This is a working solution, but it only fetches the first 20 tweets:
from urllib.request import urlopen
from bs4 import BeautifulSoup

file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)

url = "https://twitter.com/BBCWorld"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

# Gets the tweets
tweets = soup.find_all("li", attrs={"class": "js-stream-item"})

# Writes each fetched tweet to the file
for tweet in tweets:
    try:
        if tweet.find('p', {"class": 'tweet-text'}):
            tweet_user = tweet.find('span', {"class": 'username'}).text.strip()
            tweet_text = tweet.find('p', {"class": 'tweet-text'}).text.encode('utf8').strip()
            replies = tweet.find('span', {"class": "ProfileTweet-actionCount"}).text.strip()
            retweets = tweet.find('span', {"class": "ProfileTweet-action--retweet"}).text.strip()
            # String interpolation technique
            f.write(f'{tweet_user},/^{tweet_text}$/,{replies},{retweets}\n')
    except AttributeError:
        pass

f.close()
filename = "output.csv"
f = open(filename, "w",encoding="utf-8")
headers = " tweet_user, tweet_text, replies, retweets \n"
f.write(headers)
***your code***
***loop****
f.write(''.join(tweet_user + [","] + tweet_text + [","] + replies + [","] + retweets + [","] + ["\n"]) )
f.close()
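The csv module handles quoting and separators for you, so another option is to build each row as a list and let csv.writer format it; a short sketch, reusing the selectors from the scraping loop above:

import csv

with open("5_twitterBBC.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["tweet_user", "tweet_text", "replies", "retweets"])
    for tweet in tweets:
        if tweet.find('p', {"class": 'tweet-text'}):
            tweet_user = tweet.find('span', {"class": 'username'}).text.strip()
            tweet_text = tweet.find('p', {"class": 'tweet-text'}).text.strip()
            replies = tweet.find('span', {"class": "ProfileTweet-actionCount"}).text.strip()
            retweets = tweet.find('span', {"class": "ProfileTweet-action--retweet"}).text.strip()
            # csv.writer adds the commas and quotes any field that needs them
            writer.writerow([tweet_user, tweet_text, replies, retweets])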
I am trying to import data from the following URLs, and write each data set to a CSV file.
Here are a few sample URls that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this:
import csv
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
url_list = [url_base + tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
writer = csv.writer(file)
for url in url_list:
try:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except urllib.error.HTTPError:
print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX', 'MSFT', 'AAPL']
url_list = [url_base + s for s in tckr]

with open('../Python/SOtest.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            fpage.raise_for_status()  # raises HTTPError for 4xx/5xx responses
            fsoup = BeautifulSoup(fpage.content, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except HTTPError:
            print("{} - not found".format(url))
I use requests, so there is that difference, but it works, so you can pull code from there if need be.
I am trying to retrieve data from the table at this URL: https://www.infogreffe.com/recherche-entreprise-dirigeants/resultats-entreprise-dirigeants.html?ga_cat=globale&ga_q=IMERYS#phrase=IMERYS
As I observed, the data is present in an inner HTML table.
I have tried the following:
from bs4 import BeautifulSoup
import requests
url = 'https://www.infogreffe.com/recherche-entreprise-dirigeants/resultats-entreprise-dirigeants.html?ga_cat=globale&ga_q=IMERYS#phrase=IMERYS'
r = requests.get(url)
res = r.text
soup = BeautifulSoup(res,'lxml')
data = []
table = soup.find_all(class_= 'dojoxGrid')
try:
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele])
except:
print("")
print(data)
but it returns nothing. Please help me get the data from the table and explain the logic.
Thanks in advance.