Related
I can get the standard certificate information for an SSL connection in Python 3.3 via the getpeercert() method on the SSL socket. However, it doesn't seem to provide the chain like OpenSSL's "s_client" tool does.
Is there some way I can get this so that I can see if my IA certificate was configured properly?
s_client command-line:
openssl s_client -connect google.com:443
s_client result (just the first few lines):
$ openssl s_client -connect google.com:443
CONNECTED(00000003)
depth=2 C = US, O = GeoTrust Inc., CN = GeoTrust Global CA
verify error:num=20:unable to get local issuer certificate
verify return:0
---
Certificate chain
0 s:/C=US/ST=California/L=Mountain View/O=Google Inc/CN=*.google.com
i:/C=US/O=Google Inc/CN=Google Internet Authority G2
1 s:/C=US/O=Google Inc/CN=Google Internet Authority G2
i:/C=US/O=GeoTrust Inc./CN=GeoTrust Global CA
2 s:/C=US/O=GeoTrust Inc./CN=GeoTrust Global CA
i:/C=US/O=Equifax/OU=Equifax Secure Certificate Authority
---
Python 3.3 code:
import socket
import ssl  # needed so ssl.SSLError is resolvable in ssl_wrap_socket's except clause
from ssl import SSLContext # Modern SSL?
from ssl import HAS_SNI # Has SNI?
from pprint import pprint
def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                    ca_certs=None, server_hostname=None,
                    ssl_version=None):
    """Wrap an already-connected plain socket in an SSLContext.

    Parameters:
        sock            -- connected TCP socket to wrap
        keyfile/certfile-- optional client certificate pair
        cert_reqs       -- a CERT_* constant assigned to ``verify_mode``
        ca_certs        -- path to a CA bundle used to verify the peer
        server_hostname -- hostname sent via SNI (when the platform supports it)
        ssl_version     -- a PROTOCOL_* constant passed to SSLContext

    Returns the wrapped SSL socket.  Raises ssl.SSLError when the CA bundle
    in *ca_certs* cannot be loaded.
    """
    context = SSLContext(ssl_version)
    context.verify_mode = cert_reqs
    if ca_certs:
        try:
            context.load_verify_locations(ca_certs)
        # Py32 raises IOError, Py33 raises FileNotFoundError; normalise both
        # to SSLError so callers only need to catch one exception type.
        except Exception as e:
            # BUG FIX: the original wrote ``ssl.SSLError`` but this snippet
            # only imported names *from* ssl, so the except clause itself
            # died with NameError instead of re-raising as SSLError.
            raise ssl.SSLError(e) from e
    if certfile:
        # FIXME: This block needs a test.
        context.load_cert_chain(certfile, keyfile)
    if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
        return context.wrap_socket(sock, server_hostname=server_hostname)
    return context.wrap_socket(sock)
# Connect to Google over TLS and pretty-print the leaf certificate exposed by
# the standard getpeercert() API (it does NOT include the full chain).
hostname = 'www.google.com'
tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
tcp_sock.connect((hostname, 443))
# 2 == PROTOCOL_SSLv23 and 2 == CERT_REQUIRED in the ssl module's numbering.
sslSocket = ssl_wrap_socket(
    tcp_sock,
    ssl_version=2,
    cert_reqs=2,
    ca_certs='/usr/local/lib/python3.3/dist-packages/requests/cacert.pem',
    server_hostname=hostname,
)
pprint(sslSocket.getpeercert())
tcp_sock.close()
Code result:
{'issuer': ((('countryName', 'US'),),
(('organizationName', 'Google Inc'),),
(('commonName', 'Google Internet Authority G2'),)),
'notAfter': 'Sep 25 15:09:31 2014 GMT',
'notBefore': 'Sep 25 15:09:31 2013 GMT',
'serialNumber': '13A87ADB3E733D3B',
'subject': ((('countryName', 'US'),),
(('stateOrProvinceName', 'California'),),
(('localityName', 'Mountain View'),),
(('organizationName', 'Google Inc'),),
(('commonName', 'www.google.com'),)),
'subjectAltName': (('DNS', 'www.google.com'),),
'version': 3}
Thanks to the contributing answer by Aleksi, I found a bug/feature request that already requested this very thing: http://bugs.python.org/issue18233. Though the changes haven't been finalized yet, they do have a patch that makes this available:
This is the test code which I've stolen from some forgotten source and reassembled:
import socket
from ssl import wrap_socket, CERT_NONE, PROTOCOL_SSLv23
from ssl import SSLContext, SSLError # Modern SSL?
from ssl import HAS_SNI # Has SNI?
from pprint import pprint
def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                    ca_certs=None, server_hostname=None,
                    ssl_version=None):
    """Build an SSLContext, wrap *sock* with it, and hand both back.

    Returning the context alongside the wrapped socket lets the caller reach
    context-level APIs (e.g. the getpeercertchain() patch from
    http://bugs.python.org/issue18233) after the handshake.

    Raises SSLError when the CA bundle in *ca_certs* cannot be loaded.
    """
    context = SSLContext(ssl_version)
    context.verify_mode = cert_reqs
    if ca_certs:
        try:
            context.load_verify_locations(ca_certs)
        except Exception as err:
            # Py32 raises IOError, Py33 raises FileNotFoundError -- fold
            # both into the single SSLError type callers expect.
            raise SSLError(err)
    if certfile:
        # FIXME: This block needs a test.
        context.load_cert_chain(certfile, keyfile)
    # Platform-specific: only pass server_hostname when OpenSSL has SNI.
    wrapped = (context.wrap_socket(sock, server_hostname=server_hostname)
               if HAS_SNI else context.wrap_socket(sock))
    return (context, wrapped)
# Driver: connect, wrap, and dump the peer's whole certificate chain.
hostname = 'www.google.com'
print("Hostname: %s" % (hostname))
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((hostname, 443))
# 2 == PROTOCOL_SSLv23 and 2 == CERT_REQUIRED (numeric values of the ssl constants).
(context, ssl_socket) = ssl_wrap_socket(s,
ssl_version=2,
cert_reqs=2,
ca_certs='/usr/local/lib/python3.3/dist-packages/requests/cacert.pem',
server_hostname=hostname)
# NOTE: getpeercertchain() only exists in a CPython build patched with
# http://bugs.python.org/issue18233 -- it is not part of the stock ssl module,
# so this line raises AttributeError on an unpatched interpreter.
pprint(ssl_socket.getpeercertchain())
s.close()
Output:
Hostname: www.google.com
({'issuer': ((('countryName', 'US'),),
(('organizationName', 'Google Inc'),),
(('commonName', 'Google Internet Authority G2'),)),
'notAfter': 'Sep 11 11:04:38 2014 GMT',
'notBefore': 'Sep 11 11:04:38 2013 GMT',
'serialNumber': '50C71E48BCC50676',
'subject': ((('countryName', 'US'),),
(('stateOrProvinceName', 'California'),),
(('localityName', 'Mountain View'),),
(('organizationName', 'Google Inc'),),
(('commonName', 'www.google.com'),)),
'subjectAltName': (('DNS', 'www.google.com'),),
'version': 3},
{'issuer': ((('countryName', 'US'),),
(('organizationName', 'GeoTrust Inc.'),),
(('commonName', 'GeoTrust Global CA'),)),
'notAfter': 'Apr 4 15:15:55 2015 GMT',
'notBefore': 'Apr 5 15:15:55 2013 GMT',
'serialNumber': '023A69',
'subject': ((('countryName', 'US'),),
(('organizationName', 'Google Inc'),),
(('commonName', 'Google Internet Authority G2'),)),
'version': 3},
{'issuer': ((('countryName', 'US'),),
(('organizationName', 'Equifax'),),
(('organizationalUnitName',
'Equifax Secure Certificate Authority'),)),
'notAfter': 'Aug 21 04:00:00 2018 GMT',
'notBefore': 'May 21 04:00:00 2002 GMT',
'serialNumber': '12BBE6',
'subject': ((('countryName', 'US'),),
(('organizationName', 'GeoTrust Inc.'),),
(('commonName', 'GeoTrust Global CA'),)),
'version': 3},
{'issuer': ((('countryName', 'US'),),
(('organizationName', 'Equifax'),),
(('organizationalUnitName',
'Equifax Secure Certificate Authority'),)),
'notAfter': 'Aug 22 16:41:51 2018 GMT',
'notBefore': 'Aug 22 16:41:51 1998 GMT',
'serialNumber': '35DEF4CF',
'subject': ((('countryName', 'US'),),
(('organizationName', 'Equifax'),),
(('organizationalUnitName',
'Equifax Secure Certificate Authority'),)),
'version': 3})
The answer above did not work out of the box.
After going through many options, I found this to be the simplest approach, requiring a minimum of third-party libraries.
pip install pyopenssl certifi
import socket

from OpenSSL import SSL
import certifi

hostname = 'www.google.com'
port = 443

# Build a pyOpenSSL context that verifies peers against certifi's CA bundle.
context = SSL.Context(method=SSL.TLSv1_METHOD)
context.load_verify_locations(cafile=certifi.where())

conn = SSL.Connection(context, socket=socket.socket(socket.AF_INET, socket.SOCK_STREAM))
conn.settimeout(5)
conn.connect((hostname, port))
conn.setblocking(1)
# BUG FIX: the SNI hostname must be set *before* the handshake; the original
# set it afterwards, so the server never saw it and could serve the wrong
# certificate chain for multi-host endpoints.
conn.set_tlsext_host_name(hostname.encode())
conn.do_handshake()

# get_peer_cert_chain() returns every X509 object the peer presented,
# leaf first -- the chain the stdlib ssl module does not expose.
for (idx, cert) in enumerate(conn.get_peer_cert_chain()):
    print(f'{idx} subject: {cert.get_subject()}')
    # BUG FIX: dropped a stray ')' that the original printed after the issuer.
    print(f' issuer: {cert.get_issuer()}')
    print(f' fingerprint: {cert.digest("sha1")}')

conn.close()
Here is a link to the original idea
https://gist.github.com/brandond/f3d28734a40c49833176207b17a44786
Here is a reference which brought me here How to get response SSL certificate from requests in python?
I'm not sure, but I think that part of the OpenSSL API just isn't available in Python's ssl-module.
It seems that the function SSL_get_peer_cert_chain is used to access the certificate chain in OpenSSL. See, for example, the section of openssl s_client that prints the output you included. On the other hand, grepping the source of Python's ssl-module for SSL_get_peer_cert_chain yields no matches.
M2Crypto and pyOpenSSL both seem to include a get_peer_cert_chain function, if you're willing to look at other (and non-stdlib) libraries. I can't vouch for them personally, though, since I haven't used them much.
This is a follow-up to oglop's answer, as my server didn't support the standard method:
import socket
import sys

from OpenSSL import SSL
import certifi

hostname = "www.google.com"
port = 443

# Probe every protocol method pyOpenSSL may expose.  SSLv2/SSLv3 (and on some
# builds TLSv1/TLSv1.1) are compiled out of modern OpenSSL, so the constants
# can be missing entirely.
# BUG FIX: the original touched SSL.SSLv2_METHOD etc. directly while building
# the list, which raises AttributeError *outside* the try/except and aborts
# the whole script before any method is probed.  Look the names up lazily.
method_names = [
    "SSLv2_METHOD",
    "SSLv3_METHOD",
    "SSLv23_METHOD",
    "TLSv1_METHOD",
    "TLSv1_1_METHOD",
    "TLSv1_2_METHOD",
]
methods = [(getattr(SSL, name), f"SSL.{name}")
           for name in method_names if hasattr(SSL, name)]

for method, method_name in methods:
    try:
        print(f"\n-- Method {method_name}")
        context = SSL.Context(method=method)
        context.load_verify_locations(cafile=certifi.where())
        conn = SSL.Connection(
            context, socket=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        )
        conn.settimeout(5)
        conn.connect((hostname, port))
        conn.setblocking(1)
        # BUG FIX: SNI must be configured before the handshake to take effect.
        conn.set_tlsext_host_name(hostname.encode())
        conn.do_handshake()
        for (idx, cert) in enumerate(conn.get_peer_cert_chain()):
            print(f"{idx} subject: {cert.get_subject()}")
            # BUG FIX: dropped the stray ')' the original printed after the issuer.
            print(f" issuer: {cert.get_issuer()}")
            print(f' fingerprint: {cert.digest("sha1")}')
        conn.close()
    # BUG FIX: a bare ``except:`` also swallows KeyboardInterrupt/SystemExit;
    # catch Exception so Ctrl-C still stops the probe loop.
    except Exception:
        print(f"<><> Method {method_name} failed due to {sys.exc_info()[0]}")
I am trying to scrape Myntra but I got errors. I did many changes in the code. I tried requests package as well as urllib but still getting error.
Sometimes I got timeout error or urllib.error.URLError:
urllib.error.URLError: <urlopen error Tunnel connection failed: 502 Proxy Error (no funds available)>
Here is my code.
import os, ssl, http, gzip
import urllib.request
from bs4 import BeautifulSoup
import re
from http.cookiejar import CookieJar
import json
import http
import requests
def myntraScraper(url):
    """Fetch *url* through the configured proxies and print the parsed HTML.

    Installs a process-wide urllib opener that routes through ``proxy`` and
    carries cookies, requests a gzipped body, and prints the BeautifulSoup
    parse of the response.  Returns None.
    """
    # NOTE(review): disabling certificate verification process-wide is a
    # security hole; it is kept only because the scraping proxies break
    # verification.  Scope it more tightly if at all possible.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context
    cj = CookieJar()
    # Placeholder proxy URLs -- substitute real endpoints before running.
    proxy = {
        'https': '------',
        'http': '-------'
    }
    # FIX: dropped the dead Python-2 ``import urllib2`` fallback -- the code
    # already depends on urllib.request unconditionally, so it is Python-3 only.
    urllib.request.install_opener(
        urllib.request.build_opener(
            urllib.request.ProxyHandler(proxy),
            urllib.request.HTTPCookieProcessor(cj)
        )
    )
    request = urllib.request.Request(url, headers={
        'accept-encoding': 'gzip',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    })
    page = urllib.request.urlopen(request)
    body = page.read()
    # BUG FIX: the original unconditionally gunzipped the body; a proxy or
    # server that ignores the accept-encoding header returns plain bytes and
    # gzip.decompress() then raises BadGzipFile.  Check the header first.
    if page.headers.get('Content-Encoding') == 'gzip':
        body = gzip.decompress(body)
    html = body.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    print(soup)


myntraScraper("https://www.myntra.com/sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy")
Currently, I am using Smartproxy. But I tried the same thing with PacketStream and Luminati. Most of the time I got the proxy error.
Myntra stores all the product data in a script variable called pdpData.
The below script gets the whole json that contains all the data regarding the product.
import requests, json
from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

# NOTE(review): verify=False disables TLS certificate checking; drop it if
# the site's certificate validates in your environment.
session = requests.Session()
res = session.get("https://www.myntra.com/sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy", headers=headers, verify=False)
soup = BeautifulSoup(res.text, "lxml")

# BUG FIX: the original reused ``s`` for both the Session and the loop
# variable, clobbering the session object mid-script.
script = None
for tag in soup.find_all("script"):
    if 'pdpData' in tag.text:
        script = tag.get_text(strip=True)
        break

# BUG FIX: guard against the pdpData <script> not being found; slicing
# ``None`` would otherwise raise an unhelpful TypeError.
if script is None:
    raise RuntimeError("pdpData script tag not found in page")
# The variable assignment precedes the JSON literal; parse from the first '{'.
print(json.loads(script[script.index('{'):]))
Output:
{'pdpData': {'id': 11203218, 'name': 'Puma Men Blue Hybrid Fuego Running Shoes', 'mrp': 6499, 'manufacturer': 'SSIPL RETAIL LIMITED, KUNDLI,75, SERSA ROAD, 131028 SONEPAT', 'countryOfOrigin': 'India', 'colours': None, 'baseColour': 'Blue', 'brand': {'uidx': '', 'name': 'Puma', 'image': '', 'bio': ''}, 'media': {'videos': [], 'albums': [{'name': 'default', 'images': [{'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'annotation': []}, 
{'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'annotation': []}]}, {'name': 'animatedImage', 'images': []}]}, 'sbpEnabled': False, 'sizechart': {'sizeChartUrl': None, 'sizeRepresentationUrl': 'http://assets.myntassets.com/assets/images/sizechart/2016/12/12/11481538267795-footwear.png'}, 'sizeRecoLazy': {'actionType': 'lazy', 'action': '/product/11203218/size/recommendation', 'sizeProfileAction': '/user/size-profiles?gender=male&articleType=Sports%20Shoes'}, 'analytics': {'articleType': 'Sports Shoes', 'subCategory': 'Shoes', 'masterCategory': 'Footwear', 'gender': 'Men', 'brand': 'Puma', 'colourHexCode': None}, 'crossLinks': [{'title': 'More Sports Shoes by Puma', 'url': 'sports-shoes?f=Brand:Puma::Gender:men'}, {'title': 'More Blue Sports Shoes', 'url': 'sports-shoes?f=Color:Blue_0074D9::Gender:men'}, {'title': 'More Sports Shoes', 'url': 'sports-shoes?f=Gender:men'}], 'relatedStyles': None, 'disclaimerTitle': '', 'productDetails': [{'type': None, 'content': 
None, 'title': 'Product Details', 'description': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'type': None, 'content': None, 'title': 'MATERIAL & CARE', 'description': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}], 'preOrder': None, 'sizeChartDisclaimerText': '', 'tags': None, 'articleAttributes': {'Ankle Height': 'Regular', 'Arch Type': 'Medium', 'Cleats': 'No Cleats', 'Cushioning': 'Medium', 'Distance': 'Medium', 'Fastening': 'Lace-Ups', 'Material': 'Textile', 'Outsole Type': 'Marking', 'Pronation for Running Shoes': 'Neutral', 'Running Type': 'Road Running', 'Sole Material': 'Rubber', 'Sport': 'Running', 'Surface Type': 'Outdoor', 'Technology': 'NA', 'Warranty': '3 months'}, 'systemAttributes': [], 'ratings': None, 'urgency': [{'value': '0', 'type': 'PURCHASED', 'ptile': 0}, {'value': '0', 'type': 'CART', 'ptile': 0}, {'value': '0', 'type': 'WISHLIST', 'ptile': 0}, {'value': '0', 'type': 'PDP', 'ptile': 0}], 
'catalogAttributes': {'catalogDate': '1576751286000', 'season': 'summer', 'year': '2020'}, 'productContentGroupEntries': [{'title': '', 'type': 'DETAILS', 'attributes': [{'attributeName': 'Product Details', 'attributeType': 'STRING', 'value': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'attributeName': 'Material & Care', 'attributeType': 'STRING', 'value': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}, {'attributeName': 'Style Note', 'attributeType': 'STRING', 'value': "You'll look and feel super stylish in these trendsetting sports shoes by Puma. 
Match this blue pair with track pants and a sleeveless sports T-shirt when heading out for a casual day with friends."}]}], 'shoppableLooks': None, 'descriptors': [{'title': 'description', 'description': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'title': 'style_note', 'description': "You'll look and feel super stylish in these trendsetting sports shoes by Puma. 
Match this blue pair with track pants and a sleeveless sports T-shirt when heading out for a casual day with friends."}, {'title': 'materials_care_desc', 'description': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}], 'flags': {'isExchangeable': True, 'isReturnable': True, 'openBoxPickupEnabled': True, 'tryAndBuyEnabled': True, 'isLarge': False, 'isHazmat': False, 'isFragile': False, 'isJewellery': False, 'outOfStock': False, 'codEnabled': True, 'globalStore': False, 'loyaltyPointsEnabled': False, 'emiEnabled': True, 'chatEnabled': False, 'measurementModeEnabled': False, 'sampleModeEnabled': False, 'disableBuyButton': False}, 'earlyBirdOffer': None, 'serviceability': {'launchDate': '', 'returnPeriod': 30, 'descriptors': ['Pay on delivery might be available', 'Easy 30 days returns and exchanges', 'Try & Buy might be available'], 'procurementTimeInDays': {'6206': 4}}, 'buyButtonSellerOrder': [{'skuId': 38724440, 'sellerPartnerId': 6206}, {'skuId': 38724442, 'sellerPartnerId': 6206}, {'skuId': 38724446, 'sellerPartnerId': 6206}, {'skuId': 38724450, 'sellerPartnerId': 6206}, {'skuId': 38724452, 'sellerPartnerId': 6206}, {'skuId': 38724444, 'sellerPartnerId': 6206}, {'skuId': 38724448, 'sellerPartnerId': 6206}], 'sellers': [{'sellerPartnerId': 6206, 'sellerName': 'Puma Sports India Pvt. 
Ltd.(NSCM)'}], 'sizes': [{'skuId': 38724440, 'styleId': 11203218, 'action': '/product/11203218/related/6?co=1', 'label': '6', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '24.5', 'minValue': '24.5', 'maxValue': '24.5', 'unit': 'cm', 'displayText': '24.5cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '6', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '7', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '39', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 32, 'sellableInventoryCount': 32, 'warehouses': ['106', '328'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724442, 'styleId': 11203218, 'action': '/product/11203218/related/7?co=1', 'label': '7', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '25.4', 'minValue': '25.4', 'maxValue': '25.4', 'unit': 'cm', 'displayText': '25.4cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '7', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '8', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '40.5', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 86, 'sellableInventoryCount': 86, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724444, 'styleId': 11203218, 'action': '/product/11203218/related/8?co=1', 'label': '8', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 
'value': '26.2', 'minValue': '26.2', 'maxValue': '26.2', 'unit': 'cm', 'displayText': '26.2cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '8', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '9', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '42', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 188, 'sellableInventoryCount': 188, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724446, 'styleId': 11203218, 'action': '/product/11203218/related/9?co=1', 'label': '9', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '27.1', 'minValue': '27.1', 'maxValue': '27.1', 'unit': 'cm', 'displayText': '27.1cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '9', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '10', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '43', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 163, 'sellableInventoryCount': 163, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724448, 'styleId': 11203218, 'action': '/product/11203218/related/10?co=1', 'label': '10', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '27.9', 'minValue': '27.9', 'maxValue': '27.9', 'unit': 'cm', 'displayText': '27.9cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '10', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '11', 'size': 'US Size', 
'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '44.5', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 153, 'sellableInventoryCount': 153, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724450, 'styleId': 11203218, 'action': '/product/11203218/related/11?co=1', 'label': '11', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '28.8', 'minValue': '28.8', 'maxValue': '28.8', 'unit': 'cm', 'displayText': '28.8cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '11', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '12', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '46', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 43, 'sellableInventoryCount': 43, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724452, 'styleId': 11203218, 'action': '/product/11203218/related/12?co=1', 'label': '12', 'available': False, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '29.6', 'minValue': '29.6', 'maxValue': '29.6', 'unit': 'cm', 'displayText': '29.6cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '12', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '13', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '47', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': []}], 'discounts': [{'type': 1, 'freeItem': False, 'label': '(55% OFF)', 'discountText': '', 'timerStart': '0', 
'timerEnd': '1597084200', 'discountPercent': 55, 'offer': '', 'discountId': '11203218:23363948', 'heading': None, 'description': None, 'link': None, 'freeItemImage': None}], 'offers': [{'type': 'EMI', 'title': 'EMI option available', 'description': '', 'action': '/faqs', 'image': None}], 'bundledSkus': None, 'richPdp': None, 'landingPageUrl': 'sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy'}, 'pageName': 'Pdp', 'atsa': ['Sport', 'Material', 'Fastening', 'Ankle Height', 'Outsole Type', 'Cleats', 'Pronation for Running Shoes', 'Arch Type', 'Cushioning', 'Running Type', 'Warranty', 'Distance', 'Number of Components', 'Surface Type', 'Technology']}
I was trying to convert a set of parquet files into delta format in-place. I tried using the CONVERT command as mentioned in the Databricks documentation. https://docs.databricks.com/spark/latest/spark-sql/language-manual/convert-to-delta.html
CONVERT TO DELTA parquet.'path/to/table'
I am using Spark 2.4.4 and PySpark (Python version 3.5.3). This is the command I am executing
spark.sql("CONVERT TO DELTA parquet. '/usr/spark-2.4.4/data/delta-parquet/'")
where '/usr/spark-2.4.4/data/delta-parquet/' is the path where the parquet files are located.
But, I am getting an exception.
File "/usr/spark-2.4.4/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/spark-2.4.4/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o25.sql.
: org.apache.spark.sql.catalyst.parser.ParseException:
mismatched input 'CONVERT' expecting {'(', 'SELECT', 'FROM', 'ADD', 'DESC', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'MAP', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'REDUCE', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'DFS', 'TRUNCATE', 'ANALYZE', 'LIST', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'EXPORT', 'IMPORT', 'LOAD'}(line 1, pos 0)
== SQL ==
CONVERT TO DELTA parquet. '/usr/spark-2.4.4/data/delta-parquet/'
^^^
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:241)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:117)
at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:48)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:69)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/spark-2.4.4/python/pyspark/sql/session.py", line 767, in sql
return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
File "/usr/spark-2.4.4/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/spark-2.4.4/python/pyspark/sql/utils.py", line 73, in deco
raise ParseException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.ParseException: "\nmismatched input 'CONVERT' expecting {'(', 'SELECT', 'FROM', 'ADD', 'DESC', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'MAP', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'REDUCE', 'REFRESH', 'CLEAR', 'CACHE', 'UNCACHE', 'DFS', 'TRUNCATE', 'ANALYZE', 'LIST', 'REVOKE', 'GRANT', 'LOCK', 'UNLOCK', 'MSCK', 'EXPORT', 'IMPORT', 'LOAD'}(line 1, pos 0)\n\n== SQL ==\nCONVERT TO DELTA parquet. '/usr/spark-2.4.4/data/delta-parquet/'\n^^^\n"
Am I using the CONVERT command in the right way? Any help would be appreciated.
For PySpark, using the latest Delta Lake version, you can convert as follows:
from delta.tables import *
deltaTable = DeltaTable.convertToDelta(spark, "parquet.`/usr/spark-2.4.4/data/delta-parquet/`")
This example is taken from the docs
It is just a syntax error — you are using the CONVERT command in the right way:
CONVERT TO DELTA parquet.`/usr/spark-2.4.4/data/delta-parquet/`
Use backticks around the path and remove the unnecessary spaces.
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 years ago.
Improve this question
['90', '80', '70', '60', '50', '40', '30', '20', '10']
['09', '08', '07', '06', '05', '04', '03', '02', '01']
['11', '12', '13', '14', '15', '16', '17', '18', '19']
['29', '28', '27', '26', '25', '24', '23', '22', '21']
['31', '32', '33', '34', '35', '36', '37', '38', '39']
['49', '48', '47', '46', '45', '44', '43', '42', '41']
['51', '52', '53', '54', '55', '56', '57', '58', '59']
['69', '68', '67', '66', '65', '64', '63', '62', '61']
['71', '72', '73', '74', '75', '76', '77', '78', '79']
How to print the first list only? And
How to print the second and fifth lists only?
lists = [['90', '80', '70', '60', '50', '40', '30', '20', '10'],
['09', '08', '07', '06', '05', '04', '03', '02', '01'],
['11', '12', '13', '14', '15', '16', '17', '18', '19'],
['29', '28', '27', '26', '25', '24', '23', '22', '21'],
['31', '32', '33', '34', '35', '36', '37', '38', '39'],
['49', '48', '47', '46', '45', '44', '43', '42', '41'],
['51', '52', '53', '54', '55', '56', '57', '58', '59'],
['69', '68', '67', '66', '65', '64', '63', '62', '61'],
['71', '72', '73', '74', '75', '76', '77', '78', '79']]
print(lists[0], "\n", lists[8])
It prints the first list, i.e. [0], and the last one, i.e. [8].
print(lists[0], "\n", lists[1], "\n", lists[4])
And this one answers your question: it prints the 1st, 2nd and 5th lists.
If what you have is a list of lists, you access the inner lists in exactly the same way you access any element in a list.
Let's say you have a variable x:
[[1, 2, 3], [4, 5], [6, 7, 8, 9]]
The first list in that variable is x[0], [1, 2, 3].
The first and third lists can be gotten (into another list of lists) with [x[0], x[2]].
By way of comparison, getting the second item from the third list in that original list of lists (the 7) can be done with x[2][1].
# somefile.txt
['90', '80', '70', '60', '50', '40', '30', '20', '10']
['09', '08', '07', '06', '05', '04', '03', '02', '01']
['11', '12', '13', '14', '15', '16', '17', '18', '19']
['29', '28', '27', '26', '25', '24', '23', '22', '21']
['31', '32', '33', '34', '35', '36', '37', '38', '39']
['49', '48', '47', '46', '45', '44', '43', '42', '41']
['51', '52', '53', '54', '55', '56', '57', '58', '59']
['69', '68', '67', '66', '65', '64', '63', '62', '61']
['71', '72', '73', '74', '75', '76', '77', '78', '79']
# main.py
list = []
with open('somefile.txt', 'r') as f:
for line in f:
list.append(line)
print("first: ", list[0], "\n", "second: ", list[1], "\n", "third: ", list[4])
To print 1st list, 2nd list and 5th list
Getting an error when trying to update a column in the db in my pipelines file, set_data_update function.
What I am trying to do is use the function get_data to return the url and the price, for each url that is returned, call the set_data_update function, where I will swap the existing new_price into old_price, and then put
the new scraped price into new_price. It seems like my call to set_data_update in get_data always runs twice. It should run once, because at the moment I only have one row in the DB for the 2nd URL -
"https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10".
Also I see an error in the traceback
sqlite3.OperationalError: unrecognized token: ":"
products.json
{
"itemdata": [
{ "url": "https://www.amazon.com/dp/B07GWKT87L/?`coliid=I36XKNB8MLE3&colid=KRASGH7290D0&psc=0&ref_=lv_ov_lig_dp_it#customerReview",`
"title": "coffee_maker_black_and_decker",
"name": "Cobi Maguire",
"email": "cobi#noemail.com"
},
{ "url": "https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10",
"title": "coffee_maker_hamilton_beach",
"name": "Ryan Murphy",
"email": "ryan#noemail.com"
}
]
}
Error Traceback- Traceback (most recent call last):
(price_monitor) C:\Users\hassy\Documents\python_venv\price_monitor\price_monitor>scrapy crawl price_monitor
2019-06-15 17:00:10 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: price_monitor)
2019-06-15 17:00:10 [scrapy.utils.log] INFO: Versions: lxml 4.3.3.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.0, Python 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 16:07:46) [MSC v.1900 32 bit (Intel)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1b 26 Feb 2019), cryptography 2.6.1, Platform Windows-10-10.0.17134-SP0
2019-06-15 17:00:10 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'price_monitor', 'NEWSPIDER_MODULE': 'price_monitor.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['price_monitor.spiders'], 'USER_AGENT': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
2019-06-15 17:00:10 [scrapy.extensions.telnet] INFO: Telnet Password: 3c0578dfed20521c
2019-06-15 17:00:10 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2019-06-15 17:00:10 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-06-15 17:00:10 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-06-15 17:00:10 [scrapy.middleware] INFO: Enabled item pipelines:
['price_monitor.pipelines.PriceMonitorPipeline']
2019-06-15 17:00:10 [scrapy.core.engine] INFO: Spider opened
2019-06-15 17:00:10 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-06-15 17:00:10 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-06-15 17:00:11 [scrapy.core.engine] DEBUG: Crawled (200) https://www.amazon.com/robots.txt> (referer: None)
2019-06-15 17:00:11 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to https://www.amazon.com/BLACK-DECKER-CM4202S-Programmable-Coffeemaker/dp/B07GWKT87L> from https://www.amazon.com/dp/B07GWKT87L/?coliid=I36XKNB8MLE3&colid=KRASGH7290D0&psc=0&ref_=lv_ov_lig_dp_it#customerReview>
2019-06-15 17:00:11 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB> from https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10>
2019-06-15 17:00:12 [scrapy.core.engine] DEBUG: Crawled (200) https://www.amazon.com/BLACK-DECKER-CM4202S-Programmable-Coffeemaker/dp/B07GWKT87L> (referer: None)
2019-06-15 17:00:12 [scrapy.core.engine] DEBUG: Crawled (200) https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB> (referer: None)
Printing rows
('https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10', '$37.99')
calling func
2019-06-15 17:00:12 [scrapy.core.scraper] ERROR: Error processing {'email': 'ryan#noemail.com',
'name': 'Ryan Murphy',
'price': '$49.99',
'title': 'BLACK+DECKER CM4202S Select-A-Size Easy Dial Programmable '
'Coffeemaker, Extra Large 80 ounce Capacity, Stainless Steel',
'url': 'h'}
Traceback (most recent call last):
File "c:\users\hassy\documents\python_venv\price_monitor\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 37, in process_item
self.get_data(item)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 60, in get_data
self.set_data_update(item, url, new_price)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 88, in set_data_update
{'old_price': old_price, 'new_price': item['price']})
sqlite3.OperationalError: unrecognized token: ":"
Printing rows
('https://www.amazon.com/Hamilton-Beach-46310-Programmable-Coffee/dp/B07684BPLB/ref=sr_1_10?keywords=coffee+maker&qid=1559098604&s=home-garden&sr=1-10', '$37.99')
calling func
2019-06-15 17:00:12 [scrapy.core.scraper] ERROR: Error processing {'email': 'ryan#noemail.com',
'name': 'Ryan Murphy',
'price': '$34.99',
'title': 'Hamilton Beach 46310 Programmable Coffee Maker, 12 Cups, Black',
'url': 'h'}
Traceback (most recent call last):
File "c:\users\hassy\documents\python_venv\price_monitor\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 37, in process_item
self.get_data(item)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 60, in get_data
self.set_data_update(item, url, new_price)
File "c:\users\hassy\documents\python_venv\price_monitor\price_monitor\pipelines.py", line 88, in set_data_update
{'old_price': old_price, 'new_price': item['price']})
sqlite3.OperationalError: unrecognized token: ":"
2019-06-15 17:00:12 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-15 17:00:12 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1888,
'downloader/request_count': 5,
'downloader/request_method_count/GET': 5,
'downloader/response_bytes': 261495,
'downloader/response_count': 5,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/301': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 6, 15, 21, 0, 12, 534906),
'log_count/DEBUG': 5,
'log_count/ERROR': 2,
'log_count/INFO': 9,
'response_received_count': 3,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2019, 6, 15, 21, 0, 10, 799145)}
2019-06-15 17:00:12 [scrapy.core.engine] INFO: Spider closed (finished)
(price_monitor) C:\Users\hassy\Documents\python_venv\price_monitor\price_monitor>
pipelines.py
import sqlite3
class PriceMonitorPipeline(object):
    """Persist scraped prices in SQLite.

    For every item whose URL already has a row, roll the stored
    new_price into old_price and save the freshly scraped price
    as new_price.
    """

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        """Open the database and keep the connection and cursor on the instance."""
        self.conn = sqlite3.connect("price_monitor.db")
        self.curr = self.conn.cursor()

    def create_table(self):
        """Create the price_monitor table if it does not exist yet.

        NOTE(review): the original snippet called this method without ever
        defining it; the schema here is reconstructed from the columns the
        queries below actually use (url, old_price, new_price) — confirm it
        matches the real database.
        """
        self.curr.execute(
            """CREATE TABLE IF NOT EXISTS price_monitor
               (url TEXT PRIMARY KEY, old_price TEXT, new_price TEXT)""")
        self.conn.commit()

    def process_item(self, item, spider):
        """Scrapy pipeline entry point: update the stored price for this item."""
        self.get_data(item)
        return item

    def get_data(self, item):
        """If a row exists for this item's URL, roll its price forward.

        Fixes vs. the original:
        - fetchone() returns None when no row matches, so the guard must run
          BEFORE indexing the result;
        - the original `for item['url'] in rows_url:` iterated the URL string
          character by character, rebinding item['url'] to single characters
          (hence the 'url': 'h' in the error log) and firing one UPDATE per
          character.
        """
        self.curr.execute(
            "SELECT url, new_price FROM price_monitor WHERE url = :url",
            {'url': item['url']})
        row = self.curr.fetchone()
        if row is not None:
            stored_url, stored_price = row
            self.set_data_update(item, stored_url, stored_price)

    def set_data_update(self, item, url, new_price):
        """Move the stored price into old_price and save the newly scraped one.

        :param item: scraped item; item['price'] is the fresh price
        :param url: URL of the row to update (the original hard-coded one URL
                    here, ignoring this parameter)
        :param new_price: price currently stored in the row's new_price column
        """
        # SQLite named placeholders must be written ':name' with no space after
        # the colon — '=: old_price' is what raised
        # sqlite3.OperationalError: unrecognized token: ":".
        # The ':url' placeholder must also appear in the parameter mapping.
        self.curr.execute(
            """UPDATE price_monitor
               SET old_price = :old_price, new_price = :new_price
               WHERE url = :url""",
            {'old_price': new_price, 'new_price': item['price'], 'url': url})
        self.conn.commit()
items.py
import scrapy
class AmazonItem(scrapy.Item):
    """Container for one scraped Amazon product listing plus its subscriber."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()    # product page URL, taken from products.json
    title = scrapy.Field()  # product title scraped from span#productTitle
    price = scrapy.Field()  # current price scraped from span#priceblock_ourprice, e.g. '$37.99'
    name = scrapy.Field()   # subscriber name from products.json
    email = scrapy.Field()  # subscriber email from products.json
Spider
import scrapy
import json
import sys
from ..items import AmazonItem
class MySpider(scrapy.Spider):
    """Scrape the current price for each product listed in products.json."""
    name = 'price_monitor'
    newlist = []
    start_urls = []
    itemdatalist = []
    # Product metadata is loaded once, at class-definition time, exactly as in
    # the original. NOTE(review): the absolute Windows path is fragile —
    # consider making it configurable.
    with open('C:\\Users\\hassy\\Documents\\python_venv\\price_monitor\\price_monitor\\products.json') as f:
        data = json.load(f)
    itemdatalist = data['itemdata']
    for item in itemdatalist:
        start_urls.append(item['url'])

    def start_requests(self):
        # Attach each product's own metadata to its request so parse() can
        # pair the scraped price with the right product. The original looped
        # over ALL urls and ALL products inside parse(), yielding
        # len(urls) * len(products) items per response with mismatched
        # url/title/price combinations.
        for product in MySpider.data['itemdata']:
            yield scrapy.Request(url=product['url'], callback=self.parse,
                                 meta={'product': product})

    def parse(self, response):
        # One response -> exactly one item, built from the metadata carried
        # on the request that produced this response.
        product = response.meta['product']
        scrapeitem = AmazonItem()
        title = response.css('span#productTitle::text').extract_first()
        # Guard: extract_first() returns None when the selector matches
        # nothing; the original called .strip() unconditionally.
        scrapeitem['title'] = title.strip() if title else title
        scrapeitem['price'] = response.css('span#priceblock_ourprice::text').extract_first()
        scrapeitem['url'] = product['url']
        scrapeitem['name'] = product['name']
        scrapeitem['email'] = product['email']
        yield scrapeitem