I copied the answer to this Stack Overflow question, Decoding base64 from POST to use in PIL, i.e.:
from PIL import Image
from io import BytesIO
import base64
data['img'] = '''R0lGODlhDwAPAKECAAAAzMzM/////wAAACwAAAAADwAPAAACIISPeQHsrZ5ModrLlN48CXF8m2iQ3YmmKqVlRtW4MLwWACH+H09wdGltaXplZCBieSBVbGVhZCBTbWFydFNhdmVyIQAAOw=='''
im = Image.open(BytesIO(base64.b64decode(data)))
and ran it in my text editor, but it keeps saying data is undefined and I can't figure out why.
Remove ['img'] so that data is a plain string:
from PIL import Image
from io import BytesIO
import base64
data = '''R0lGODlhDwAPAKECAAAAzMzM/////wAAACwAAAAADwAPAAACIISPeQHsrZ5ModrLlN48CXF8m2iQ3YmmKqVlRtW4MLwWACH+H09wdGltaXplZCBieSBVbGVhZCBTbWFydFNhdmVyIQAAOw=='''
im = Image.open(BytesIO(base64.b64decode(data)))
Just add data = dict() before the data['img'] = ... line; data needs to be defined as a dict before you can access a key on it with the bracket operator.
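Note that if you keep the dict, the decode also has to target the value, not the dict itself, or b64decode raises a TypeError. A minimal sketch of that variant (base64 string truncated here):
from PIL import Image
from io import BytesIO
import base64

data = dict()
data['img'] = '''R0lGODlhDwAPAKEC...'''  # the full base64 string from above
# decode the dict value, not the dict itself
im = Image.open(BytesIO(base64.b64decode(data['img'])))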
I am trying to read bucket files without saving them to disk:
import boto3
import botocore
from io import StringIO
import pandas as pd
s3 = boto3.resource('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
bucket = s3.Bucket('deutsche-boerse-xetra-pds')
objects = bucket.objects.filter(Prefix=date)
for obj in objects:
    file = pd.read_csv(StringIO(bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')))
This code works quite well. However, I would like to use concurrency (Python asyncio) to speed up the reading process. I searched the documentation, but I could only find something for the download function, not for the get function.
Do you have any suggestion?
Thanks in advance.
I found a solution that works with multiprocessing, since my final goal was to reduce the processing time. Here is the code:
import multiprocessing
from io import StringIO
from typing import List

import boto3
import botocore
import pandas as pd

def generate_bucket():
    s3_resource = boto3.resource('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    xetra_bucket = s3_resource.Bucket('deutsche-boerse-xetra-pds')
    return s3_resource, xetra_bucket

def read_csv(object_key):
    # each worker process builds its own resource, since boto3 resources are not picklable
    s3local, bucket_local = generate_bucket()
    return pd.read_csv(StringIO(bucket_local.Object(key=object_key).get().get('Body').read().decode('utf-8')))

def import_raw_data(date: List[str]) -> pd.DataFrame:
    s3local, bucket_local2 = generate_bucket()
    objects = [i.key for i in bucket_local2.objects.filter(Prefix=date[0])]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        df = pd.concat(p.map(read_csv, objects))
    return df
For me it works, but I am sure this code can still be improved. I'm open to suggestions.
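Since the reads are I/O-bound rather than CPU-bound, a thread pool might be a lighter-weight alternative to processes. A sketch of that idea, reusing generate_bucket from above (untested against this bucket; note that boto3 resources are not documented as thread safe, so each call builds its own):
from concurrent.futures import ThreadPoolExecutor

def read_csv_threaded(object_key):
    # boto3 resources are not documented as thread safe, so build one per call
    s3local, bucket_local = generate_bucket()
    return pd.read_csv(StringIO(bucket_local.Object(key=object_key).get().get('Body').read().decode('utf-8')))

def import_raw_data_threaded(date):
    s3local, bucket_local = generate_bucket()
    keys = [i.key for i in bucket_local.objects.filter(Prefix=date[0])]
    # threads avoid the pickling and process start-up costs of multiprocessing
    with ThreadPoolExecutor(max_workers=16) as ex:
        frames = list(ex.map(read_csv_threaded, keys))
    return pd.concat(frames)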
I am following the example code from "O'Reilly Web Scraping with Python: Collecting More Data from the Modern Web" and found that it raises an error.
The versions are:
Python 3.7.3, BeautifulSoup 4
The code is as follows:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
import codecs
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}',format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a',
        href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
links.encoding = 'utf8'
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
Looking at this rather old question, I see the problem is a typo (and I voted to close it as such):
html = urlopen('http://en.wikipedia.org{}',format(articleUrl))
                                          ^
That comma , should be a dot ., because otherwise, according to the documentation, we are passing a second parameter, data:
data must be an object specifying additional data to send to the server, or None if no such data is needed. Currently HTTP requests are the only ones that use data. The supported object types include bytes, file-like objects, and iterables of bytes-like objects.
Note the last sentence; the data parameter accepts bytes, file-like objects, or iterables of bytes-like objects, but format() returns a string, hence the error:
TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
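For completeness, the corrected call, with only the comma changed to a dot:
html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))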
I am working on getting the transactions from the Authorize.Net API.
I am using their code sample, and the SDK says that in order to switch to the production environment, I need to set the environment on the controller.
The link is here. I am not sure where I should add this line of code:
createtransactioncontroller.setenvironment(constants.PRODUCTION)
The rest of the code is here.
Is this the right way to use the controller?
import os
import sys
import imp
from datetime import datetime, timedelta
from authorizenet import apicontractsv1
from authorizenet.apicontrollers import getSettledBatchListController
from authorizenet.apicontrollers import createTransactionController
constants = imp.load_source('modulename', 'constants.py')
def get_settled_batch_list():
    """get settled batch list"""
    createTransactionController.setenvironment(constants.PRODUCTION)
    merchantAuth = apicontractsv1.merchantAuthenticationType()
I had this same error, and the way I fixed it was to rename the file constants.py to credentials.py and change the variable to MY_CONSTANTS (you can call them credentials if you want).
If it still doesn't work at that point, you could try hard-coding the URL instead with createtransactioncontroller.setenvironment('https://api2.authorize.net/xml/v1/request.api'); otherwise, leave it as constants.PRODUCTION.
createtransactioncontroller = createTransactionController(createtransactionrequest)
createtransactioncontroller.setenvironment(constants.PRODUCTION)
# or createtransactioncontroller.setenvironment('https://api2.authorize.net/xml/v1/request.api')
createtransactioncontroller.execute()
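After execute() you would normally read the result back; if I remember the SDK correctly, its samples do this with getresponse():
response = createtransactioncontroller.getresponse()
# inspect response.messages.resultCode to see whether the call succeeded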
I used a dictionary for my credentials (constants in your case), so mine looks a little different.
import imp
import os
import sys
import importlib
from authorizenet.constants import constants
from authorizenet import apicontractsv1
from authorizenet.apicontrollers import createTransactionController
from .credentials import MY_CONSTANTS
# retrieved from the constants file
merchantAuth = apicontractsv1.merchantAuthenticationType()
merchantAuth.name = MY_CONSTANTS['apiLoginId']
merchantAuth.transactionKey = MY_CONSTANTS['transactionKey']
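For reference, my credentials.py is just a module holding a plain dict; a minimal sketch with placeholder values (the key names match the ones used above):
# credentials.py
MY_CONSTANTS = {
    'apiLoginId': 'YOUR_API_LOGIN_ID',
    'transactionKey': 'YOUR_TRANSACTION_KEY',
}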
I hope this helped you.
My dataframe data3 has 56000 rows, with an image thumbnail URL as one of its column values. I am evaluating whether each one of those images is low contrast or not. I let the code below run for 9 hours, but there is still no result and the kernel is still busy. Can you please let me know what's wrong?
P.S. I tried the code with a subset of my dataframe (100 rows), and it took 3 seconds to run successfully. By that standard, 56000 rows should take about 30 minutes. Is there a memory overrun happening with temp files or something?
Maybe I need to introduce a try block here to catch any exceptions (even though no error is showing)? I'm not sure how to do that.
from PIL import Image
import urllib.request
import skimage.exposure

def f(row):
    URL = row['ThumbnailURL']
    #URL = 'http://www.moma.org/media/W1siZiIsIjU5NDA1Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=137b8455b1ec6167'
    # download the thumbnail to a temporary file on disk
    with urllib.request.urlopen(URL) as url:
        with open('temp.jpg', 'wb') as out:
            out.write(url.read())
    tutu = Image.open('temp.jpg')
    val = skimage.exposure.is_low_contrast(tutu, fraction_threshold=0.4, lower_percentile=1, upper_percentile=99, method='linear')
    return val

data3['lowcontornot'] = data3.apply(f, axis=1)
The solution below avoids saving temporary images to disk, thus reducing the overhead:
def f(row):
    url = row['ThumbnailURL']
    img = Image.open(BytesIO(requests.get(url).content))
    return is_low_contrast(img, fraction_threshold=0.4, lower_percentile=1,
                           upper_percentile=99, method='linear')
To give this code a try, you need to include the following imports:
import requests
from io import BytesIO
from PIL import Image
from skimage.exposure import is_low_contrast
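One more tweak worth trying, beyond the original answer: reuse a single requests.Session so HTTP connections are kept alive across the 56000 requests, and pass a timeout so one stalled download cannot hang the whole apply call (which may be what kept the kernel busy for 9 hours). A sketch under those assumptions:
session = requests.Session()  # reuses TCP connections between requests

def f(row):
    url = row['ThumbnailURL']
    # timeout guards against a single request blocking forever
    img = Image.open(BytesIO(session.get(url, timeout=10).content))
    return is_low_contrast(img, fraction_threshold=0.4, lower_percentile=1,
                           upper_percentile=99, method='linear')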
I was trying to make a simple program to pull an image from the website xkcd.com, and I seem to be running into a problem where it returns AttributeError: 'list' object has no attribute 'show'. Anyone know how to fix this?
import requests
from lxml import html
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
tree = html.fromstring(r.content)
final = tree.xpath("""//*[@id="comic"]/img""")
final.show()
Your call to requests.get is retrieving the actual image, the raw bytes of the PNG. There is no HTML to parse or search with XPath.
Note here that the content is bytes:
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
print(r.content)
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xe4\x00\x00\x01#\x08\x03\x00\x00\x00M\x7f\xe4\xc6\x00\x00\x00\x04gAMA\x00\x00\xb1\x8f
Here you can see that you can save the result directly to disk:
import requests
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
with open("myimage.png", "wb") as f:
f.write(r.content)
[Edit] And to show the image (you will need to install Pillow):
import requests
from PIL import Image
import io
r = requests.get("http://imgs.xkcd.com/comics/self_driving_issues.png")
img = Image.open(io.BytesIO(r.content))
img.show()