I'm writing a script that reads the differences between two CSV files. Once the diff is built, I am supposed to use a webhook to post the results of the comparison to a Slack channel. I am having difficulty sending the POST request.
The URL supplied by Slack returns a 400 response;
with either /post or :8080 appended I get a 200, but nothing shows up in the Slack channel.
Any thoughts or suggestions?
import csv
import json

import pandas as pd
import requests

l = []  # rows that differ between the two files

def main():
    csvDiff()
    #print(l)
    post()

def csvDiff():
    f = open("new.csv")
    csv_f = csv.reader(f)
    old = set(pd.read_csv("old.csv", index_col=False, header=None)[0])  # reads the csv, takes only the first column and creates a set out of it
    new = set(pd.read_csv("new.csv", index_col=False, header=None)[0])  # same here
    diff = new - old
    # Convert the diff set into a list
    diff = list(diff)
    #print(diff)
    #print(newConnections)
    for row in csv_f:
        if row[0] in diff:
            l.append(row)

def makeCsv():
    l = pd.to_csv

def post():
    url = 'whatever'
    payload = {"text": "A very important thing has occurred! <https://alert-system.com/alerts/1234|Click here> for details!"}
    r = requests.post(url, data=json.dumps(l).encode('utf8'))
    print(r)

if __name__ == "__main__":
    main()
Try this line instead:
r = requests.post(url, json=payload)
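For context, here is a minimal sketch of a corrected post(). The webhook URL is a placeholder, and including the diff list in the message text is an assumption about what you want to send; Slack incoming webhooks expect a JSON body with a "text" field, which requests builds for you when you pass json=.

import requests

def post(diff_rows):
    # Placeholder: use your real incoming-webhook URL here
    url = "https://hooks.slack.com/services/XXX/YYY/ZZZ"
    # Slack incoming webhooks expect a JSON payload with a "text" field
    payload = {"text": "CSV diff results: {}".format(diff_rows)}
    r = requests.post(url, json=payload)  # json= sets Content-Type: application/json
    print(r.status_code, r.text)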
I am able to extract the data from PDF to text, but now I need the data in CSV format with the table structure preserved. I tried to get the table structure but it didn't work. I'm also able to generate the output as JSON. Is there a way to get the result into a tabular CSV format? Any inputs?
Below is the code I have used.
import boto3
import time

# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"

def startJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def isJobComplete(jobId):
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    #print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        #print("Job status: {}".format(status))
    return status

def getJobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        #print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

def lambda_handler(event, context):
    jobId = startJob(s3BucketName, documentName)
    #print("Started job with id: {}".format(jobId))
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        # Print detected text
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    print(item["Text"])
You can use the csv module to write to a csv file like so:
import csv

with open('my_pdf.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('my_pdf.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
You can just put in the rows you need, and this splits your data into comma-separated values. You can find more information on csv.writer (and the csv module in general) in the Python docs.
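If you want to go straight from the Textract response to a CSV, here is a minimal sketch; it assumes the pages list returned by getJobResults above and writes one detected LINE per row. Note that LINE blocks from text detection carry no table structure, so this produces plain rows, not a reconstructed table; for genuine table extraction you would use Textract's document-analysis API with the TABLES feature type, which returns TABLE and CELL blocks instead.

import csv

def blocks_to_csv(pages, out_path="textract_lines.csv"):
    # Write every detected LINE block as a single-column CSV row (hypothetical helper)
    with open(out_path, "w", newline="") as out_file:
        writer = csv.writer(out_file)
        for page in pages:
            for block in page["Blocks"]:
                if block["BlockType"] == "LINE":
                    writer.writerow([block["Text"]])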
This whole script worked fine the first 2-3 times, but now it constantly gets 503 responses.
I checked my internet connection multiple times and there was no problem with it.
Here is my code:
from bs4 import BeautifulSoup
import requests, sys, os, json

def get_amazon_search_page(search):
    search = search.strip().replace(" ", "+")
    for i in range(3):  # tries to connect and get request the amazon 3 times
        try:
            print("Searching...")
            response = requests.get("https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))  # search string will be manipulated by replacing all spaces with "+" in order to search from the website itself
            print(response.status_code)
            if response.status_code == 200:
                return response.content, search
        except Exception:
            pass
    print("Is the search valid for the site: https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
    sys.exit(1)

def get_items_from_page(page_content):
    print(page_content)
    soup = BeautifulSoup(page_content, "html.parser")  # soup for extracting information
    items = soup.find_all("span", class_ = "a-size-medium a-color-base a-text-normal")
    prices = soup.find_all("span", class_ = "a-price-whole")
    item_list = []
    total_price_of_all = 0
    for item, price in zip(items, prices):
        dict = {}
        dict["Name"] = item.text
        dict["Price"] = int(price.text)
        total_price_of_all += int(price.text.replace(",", ""))
        item_list.append(dict)
    average_price = total_price_of_all/len(item_list)
    file = open("items.json", "w")
    json.dump(item_list, file, indent = 4)
    print("Your search results are available in the items.json file")
    print("Average prices for the search: {}".format(average_price))
    file.close()

def main():
    os.system("clear")
    print("Note: Sometimes amazon site misbehaves by sending 503 responses, this can be due to heavy traffic on that site, please cooperate\n\n")
    search = input("Enter product name: ").strip()
    page_content = get_amazon_search_page(search)
    get_items_from_page(page_content)

if __name__ == "__main__":
    while True:
        main()
Please help!
The server blocks you from scraping it.
If you check the robots.txt, you can see that the link you are trying to request is disallowed:
Disallow: */s?k=*&rh=n*p_*p_*p_
However, a simple way to bypass this blocking would be to change your User-Agent (see here). By default, requests sends something like "python-requests/2.22.0". Changing it to something more browser-like would temporarily work.
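For illustration, here is a minimal sketch of sending a browser-like User-Agent with requests; the header string is just an example value, not something specific to Amazon:

import requests

headers = {
    # Example browser-like User-Agent string; any realistic value works the same way
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
}
response = requests.get("https://www.amazon.in/s?k=xbox&ref=nb_sb_noss", headers=headers)
print(response.status_code)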
I have an application (written in PyQt5) that returns the x, y, and elevation of a location. When the user fills in x and y and hits the getz button, the app calls the function below:
def getz(self, i):
    """calculates the elevation"""
    import urllib
    url = "https://api.open-elevation.com/api/v1/lookup"
    x = self.lineEditX.text()
    y = self.lineEditY.text()
    url = url + "\?locations\={},{}".format(x, y)
    print(url)
    if i is "pushButtonSiteZ":
        response = urllib.request.Request(url)
        fp = urllib.request.urlopen(response)
        print('response is ' + response)
        self.lineEditSiteZ.setText(fp)
According to the Open Elevation guide, you have to make requests in the form:
curl https://api.open-elevation.com/api/v1/lookup\?locations\=50.3354,10.4567
in order to get elevation data as a JSON object. But in my case it returns an error saying:
raise RemoteDisconnected("Remote end closed connection without"
RemoteDisconnected: Remote end closed connection without response
and nothing happens. How can I fix this?
There is no way around retrying in a loop (keep trying until the response is OK), because the Open Elevation API is still unreliable under load. The following piece of code works, possibly after a long delay:
def getz(self, i):
    """calculates the elevation"""
    import json
    import requests
    url = "https://api.open-elevation.com/api/v1/lookup"
    if i == 'pushButtonSiteZ':
        x = self.lineEditSiteX.text()
        y = self.lineEditSiteY.text()
        param = url + '?locations={},{}'.format(x, y)
        print(param)
        while True:
            try:
                response = requests.get(param)
                print(response.status_code)
                if str(response.status_code) == '200':
                    r = response.text
                    r = json.loads(r)
                    out = r['results'][0]['elevation']
                    print(out)
                    self.lineEditSiteZ.setText(str(out))
                    cal_rng(self)
                    break
            except ConnectionError:
                continue
            except json.decoder.JSONDecodeError:
                continue
            except KeyboardInterrupt:
                continue
            except requests.exceptions.SSLError:
                continue
            except requests.exceptions.ConnectionError:
                continue
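As an alternative to the hand-rolled loop, requests can retry transient failures for you via urllib3's Retry. A minimal sketch, with the retry counts and status codes chosen as example values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 5 times with exponential backoff on typical transient errors
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

response = session.get("https://api.open-elevation.com/api/v1/lookup?locations=50.3354,10.4567", timeout=30)
print(response.json()['results'][0]['elevation'])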
I can't, for the life of me, figure out what is wrong with the following four lines of code.
def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
    return bool(response)
I am running this through AWS Lambda, and the CloudWatch log is telling me the error is on the return line. This is the error (line 24 is the return line):
Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 24)
In case this helps at all, here is the rest of the code:
################################
# Slack Lambda handler.
################################
import boto3
import os
import logging
import urllib

# Grab data from the environment.
BOT_TOKEN = os.environ["BOT_TOKEN"]
ASSET_TABLE = os.environ["ASSET_TABLE"]
REGION_NAME = os.getenv('REGION_NAME', 'us-east-1')

dynamo = boto3.client('dynamodb', region_name=REGION_NAME)

# Define the URL of the targeted Slack API resource.
SLACK_URL = "https://slack.com/api/chat.postMessage"

def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
    return bool(response)

def lambda_handler(data, context):
    # Slack challenge answer.
    if "challenge" in data:
        return data["challenge"]

    # Grab the Slack channel data.
    slack_event = data['event']
    slack_user = slack_event["user"]
    slack_text = slack_event["text"]
    channel_id = slack_event["channel"]
    slack_userID = slack_user["ID"]
    slack_reply = ""

    # Ignore bot messages.
    if "bot_id" in slack_event:
        logging.warn("Ignore bot event")
    else:
        # Start data sift.
        if slack_text.startswith("!networth"):
            slack_reply = "Your networth is: "
        elif slack_text.startwith("!price"):
            command,asset = text.split()
            slack_reply = "The price of a(n) %s is: " % (asset)
        elif slack_text.startwith("!Addme"):
            if not getAssetExistance(slack_userID, userID, ASSET_TABLE):
                slack_reply = "Adding user: %s(%s)" % (slack_user, slack_userID)
                dynamo.update_item(TableName=ASSET_TABLE,
                    Key={'userID':{'S':'slack_userID'},
                    AttributeUpdates= {
                        'resources':{
                            'Action': 'ADD',
                            'Value': {'N': '1000'}
                        }
                    }
                )
            else
                slack_reply = "User %s(%s) already exists" % (slack_user, slack_userID)

    # We need to send back three pieces of information:
    data = urllib.parse.urlencode(
        (
            ("token", BOT_TOKEN),
            ("channel", channel_id),
            ("text", slack_reply)
        )
    )
    data = data.encode("ascii")

    # Construct the HTTP request that will be sent to the Slack API.
    request = urllib.request.Request(
        SLACK_URL,
        data=data,
        method="POST"
    )
    # Add a header mentioning that the text is URL-encoded.
    request.add_header(
        "Content-Type",
        "application/x-www-form-urlencoded"
    )

    # Fire off the request!
    urllib.request.urlopen(request).read()

    # Everything went fine.
    return "200 OK"
Hopefully I am just doing something dumb; I am pretty new to all this. Any help is much appreciated. Thanks!
You missed a closing round bracket in this line:
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset)
Replace it with:
response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset))
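Two related points worth checking once the syntax error is fixed (both are assumptions about the rest of your setup): .Table() is available on the DynamoDB resource rather than the low-level client created with boto3.client('dynamodb'), and Key needs to be imported from boto3.dynamodb.conditions. A minimal sketch:

import boto3
from boto3.dynamodb.conditions import Key

# .Table() exists on the resource, not on boto3.client('dynamodb')
dynamo = boto3.resource('dynamodb', region_name=REGION_NAME)

def getAssetExistance(asset, element, table):
    dynamoTable = dynamo.Table(table)
    response = dynamoTable.query(KeyConditionExpression=Key(element).eq(asset))
    # 'Items' is empty when nothing matches, so check it rather than the whole response
    return bool(response.get('Items'))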
I have a web scraping program that gets multiple pages, but I have to set the while loop to a number. I want to make a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition to make it stop without putting a random number?
import requests
from bs4 import BeautifulSoup
import csv

filename="output.csv"
f=open(filename, 'w', newline="",encoding='utf-8')
headers="Date, Location, Title, Price\n"
f.write(headers)

i=0
while i<5000:
    if i==0:
        page_link="https://portland.craigslist.org/search/sss?query=xbox&sort=date"
    else:
        page_link="https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)
    res=requests.get(page_link)
    soup=BeautifulSoup(res.text,'html.parser')
    for container in soup.select('.result-info'):
        date=container.select('.result-date')[0].text
        try:
            location=container.select('.result-hood')[0].text
        except:
            try:
                location=container.select('.nearby')[0].text
            except:
                location='NULL'
        title=container.select('.result-title')[0].text
        try:
            price=container.select('.result-price')[0].text
        except:
            price="NULL"
        print(date,location,title,price)
        f.write(date+','+location.replace(","," ")+','+title.replace(","," ")+','+price+'\n')
    i+=120
f.close()
I use while True to run an endless loop and break to exit when there is no data:
data = soup.select('.result-info')
if not data:
    print('END: no data:')
    break
I use the csv module to save the data, so I don't have to use replace(","," "); it puts the text in quotes if there is a , in it.
s={} can be in any place after the ?, so I put it at the end to make the code more readable.
The portal returns the first page even if you use s=0, so I don't have to check i == 0
(BTW: in my code it has the more readable name offset).
Full code.
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow( ["Date", "Location", "Title", "Price"] )

offset = 0
while True:
    print('offset:', offset)
    url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)
    response = requests.get(url)
    if response.status_code != 200:
        print('END: request status:', response.status_code)
        break
    soup = BeautifulSoup(response.text, 'html.parser')
    data = soup.select('.result-info')
    if not data:
        print('END: no data:')
        break
    for container in data:
        date = container.select('.result-date')[0].text
        try:
            location = container.select('.result-hood')[0].text
        except:
            try:
                location = container.select('.nearby')[0].text
            except:
                location = 'NULL'
        #location = location.replace(","," ") # don't need it with `csvwriter`
        title = container.select('.result-title')[0].text
        try:
            price = container.select('.result-price')[0].text
        except:
            price = "NULL"
        #title.replace(",", " ") # don't need it with `csvwriter`
        print(date, location, title, price)
        csvwriter.writerow( [date, location, title, price] )
    offset += 120
f.close()