Unable To Scrape Myntra - python-3.x

I am trying to scrape Myntra, but I keep getting errors. I have made many changes to the code and have tried the requests package as well as urllib, but I still get errors.
Sometimes I got timeout error or urllib.error.URLError:
urllib.error.URLError: <urlopen error Tunnel connection failed: 502 Proxy Error (no funds available)>
Here is my code.
import os, ssl, http, gzip
import urllib.request
from bs4 import BeautifulSoup
import re
from http.cookiejar import CookieJar
import json
import http
import requests
def myntraScraper(url):
    """Fetch ``url`` through the configured proxy and print the parsed page.

    Installs a process-wide urllib opener that routes requests through
    ``proxy`` and keeps cookies in a ``CookieJar``, requests the page with a
    browser-like User-Agent, and prints the resulting BeautifulSoup document.

    Args:
        url: Address of the page to download.

    Raises:
        urllib.error.URLError: If the proxy or the target cannot be reached
            (for example a ``502 Proxy Error`` tunnel failure).
    """
    # Unless PYTHONHTTPSVERIFY is set, switch the default HTTPS context to the
    # unverified one. NOTE(review): this disables certificate checks for every
    # urllib HTTPS request made by the process afterwards.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '')
            and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    cj = CookieJar()
    proxy = {
        'https': '------',
        'http': '-------'
    }
    urllib.request.install_opener(
        urllib.request.build_opener(
            urllib.request.ProxyHandler(proxy),
            urllib.request.HTTPCookieProcessor(cj)
        )
    )
    request = urllib.request.Request(url, headers={
        'accept-encoding': 'gzip',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    })

    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(request) as page:
        body = page.read()
        content_encoding = page.headers.get('Content-Encoding', '')

    # We only *offer* gzip; the server (or proxy) may still answer with an
    # uncompressed body, in which case gzip.decompress() would fail.
    if 'gzip' in content_encoding.lower():
        body = gzip.decompress(body)

    html = body.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    print(soup)


myntraScraper("https://www.myntra.com/sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy")
Currently, I am using Smartproxy. But I tried the same thing with PacketStream and Luminati. Most of the time I got the proxy error.

Myntra stores all the product data inside a <script> tag, in an object called pdpData.
The below script gets the whole json that contains all the data regarding the product.
import requests, json
from bs4 import BeautifulSoup
# Fetch the product page and print the JSON object embedded in the <script>
# tag that mentions "pdpData".
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
s = requests.Session()
# NOTE(review): verify=False turns off TLS certificate verification for this
# request; drop it if the local certificate store works.
res = s.get("https://www.myntra.com/sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy", headers=headers, verify=False)
soup = BeautifulSoup(res.text,"lxml")

# First <script> whose text mentions pdpData, or None when there is none.
# The loop variable is deliberately not called ``s`` so the session above is
# not overwritten.
script = next(
    (
        tag.get_text(strip=True)
        for tag in soup.find_all("script")
        if 'pdpData' in tag.text
    ),
    None,
)
if script is None:
    # Fail with a readable message instead of "'NoneType' object is not
    # subscriptable" when the page did not contain the expected script.
    raise RuntimeError(
        f"No <script> tag containing 'pdpData' was found (HTTP status {res.status_code})"
    )

# The JSON payload starts at the first "{" of the script text.
print(json.loads(script[script.index('{'):]))
Output:
{'pdpData': {'id': 11203218, 'name': 'Puma Men Blue Hybrid Fuego Running Shoes', 'mrp': 6499, 'manufacturer': 'SSIPL RETAIL LIMITED, KUNDLI,75, SERSA ROAD, 131028 SONEPAT', 'countryOfOrigin': 'India', 'colours': None, 'baseColour': 'Blue', 'brand': {'uidx': '', 'name': 'Puma', 'image': '', 'bio': ''}, 'media': {'videos': [], 'albums': [{'name': 'default', 'images': [{'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/0c15e03c-863b-4a4a-9bb7-709a733fd4821576816965952-1.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/69bfa4e0-1ac4-4adf-b84e-4815ff60e8831576816966007-2.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/d2fd0ca0-1643-43ae-a0fc-fb1309580e151576816966049-3.jpg', 'annotation': []}, 
{'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/0edae428-b9c0-4755-9127-0961d872b78a1576816966095-4.jpg', 'annotation': []}, {'src': 'http://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'secureSrc': 'https://assets.myntassets.com/h_($height),q_($qualityPercentage),w_($width)/v1/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'host': None, 'imageURL': 'http://assets.myntassets.com/assets/images/productimage/2019/12/20/c59c7677-2bbd-4dbe-9b02-7c321c29cb701576816966142-5.jpg', 'annotation': []}]}, {'name': 'animatedImage', 'images': []}]}, 'sbpEnabled': False, 'sizechart': {'sizeChartUrl': None, 'sizeRepresentationUrl': 'http://assets.myntassets.com/assets/images/sizechart/2016/12/12/11481538267795-footwear.png'}, 'sizeRecoLazy': {'actionType': 'lazy', 'action': '/product/11203218/size/recommendation', 'sizeProfileAction': '/user/size-profiles?gender=male&articleType=Sports%20Shoes'}, 'analytics': {'articleType': 'Sports Shoes', 'subCategory': 'Shoes', 'masterCategory': 'Footwear', 'gender': 'Men', 'brand': 'Puma', 'colourHexCode': None}, 'crossLinks': [{'title': 'More Sports Shoes by Puma', 'url': 'sports-shoes?f=Brand:Puma::Gender:men'}, {'title': 'More Blue Sports Shoes', 'url': 'sports-shoes?f=Color:Blue_0074D9::Gender:men'}, {'title': 'More Sports Shoes', 'url': 'sports-shoes?f=Gender:men'}], 'relatedStyles': None, 'disclaimerTitle': '', 'productDetails': [{'type': None, 'content': 
None, 'title': 'Product Details', 'description': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'type': None, 'content': None, 'title': 'MATERIAL & CARE', 'description': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}], 'preOrder': None, 'sizeChartDisclaimerText': '', 'tags': None, 'articleAttributes': {'Ankle Height': 'Regular', 'Arch Type': 'Medium', 'Cleats': 'No Cleats', 'Cushioning': 'Medium', 'Distance': 'Medium', 'Fastening': 'Lace-Ups', 'Material': 'Textile', 'Outsole Type': 'Marking', 'Pronation for Running Shoes': 'Neutral', 'Running Type': 'Road Running', 'Sole Material': 'Rubber', 'Sport': 'Running', 'Surface Type': 'Outdoor', 'Technology': 'NA', 'Warranty': '3 months'}, 'systemAttributes': [], 'ratings': None, 'urgency': [{'value': '0', 'type': 'PURCHASED', 'ptile': 0}, {'value': '0', 'type': 'CART', 'ptile': 0}, {'value': '0', 'type': 'WISHLIST', 'ptile': 0}, {'value': '0', 'type': 'PDP', 'ptile': 0}], 
'catalogAttributes': {'catalogDate': '1576751286000', 'season': 'summer', 'year': '2020'}, 'productContentGroupEntries': [{'title': '', 'type': 'DETAILS', 'attributes': [{'attributeName': 'Product Details', 'attributeType': 'STRING', 'value': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'attributeName': 'Material & Care', 'attributeType': 'STRING', 'value': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}, {'attributeName': 'Style Note', 'attributeType': 'STRING', 'value': "You'll look and feel super stylish in these trendsetting sports shoes by Puma. 
Match this blue pair with track pants and a sleeveless sports T-shirt when heading out for a casual day with friends."}]}], 'shoppableLooks': None, 'descriptors': [{'title': 'description', 'description': "<b>FEATURES + BENEFITS</b><br>HYBRID: PUMA's combination of two of its best technologies: IGNITE foam and NRGY beads<br>IGNITE: PUMA's foam midsole and branded heel cage supports and stabilises by locking the heel onto the platform<br>NRGY: PUMA's foam midsole offers superior cushion from heel to toe so you can power through your run<br>Heel-to-toe drop: 12mm<br><br><b>Product Design Details</b><ul><li>A pair of blue & brown running sports shoes, has regular styling, lace-up detail</li><li>Low boot silhouette</li><li>Lightweight synthetic upper</li><li>Overlays to secure the heel</li><li>Classic tongue</li><li>Lace-up closure</li><li>Rubber outsole for traction and durability</li><li>PUMA Wordmark at the tongue</li><li>PUMA Cat Logo at heel</li><li>Warranty: 3 months</li><li>Warranty provided by brand/manufacturer</li></ul><br><b>PRODUCT STORY</b><br>Change the name of the game with the HYBRID Fuego running sneakers. This bold colour-blocked shoe pairs a HYBRID foam midsole and a grippy rubber outsole for the ultimate in comfort and stability while still maintaining a stylish edge."}, {'title': 'style_note', 'description': "You'll look and feel super stylish in these trendsetting sports shoes by Puma. 
Match this blue pair with track pants and a sleeveless sports T-shirt when heading out for a casual day with friends."}, {'title': 'materials_care_desc', 'description': 'Textile<br>Wipe with a clean, dry cloth to remove dust'}], 'flags': {'isExchangeable': True, 'isReturnable': True, 'openBoxPickupEnabled': True, 'tryAndBuyEnabled': True, 'isLarge': False, 'isHazmat': False, 'isFragile': False, 'isJewellery': False, 'outOfStock': False, 'codEnabled': True, 'globalStore': False, 'loyaltyPointsEnabled': False, 'emiEnabled': True, 'chatEnabled': False, 'measurementModeEnabled': False, 'sampleModeEnabled': False, 'disableBuyButton': False}, 'earlyBirdOffer': None, 'serviceability': {'launchDate': '', 'returnPeriod': 30, 'descriptors': ['Pay on delivery might be available', 'Easy 30 days returns and exchanges', 'Try & Buy might be available'], 'procurementTimeInDays': {'6206': 4}}, 'buyButtonSellerOrder': [{'skuId': 38724440, 'sellerPartnerId': 6206}, {'skuId': 38724442, 'sellerPartnerId': 6206}, {'skuId': 38724446, 'sellerPartnerId': 6206}, {'skuId': 38724450, 'sellerPartnerId': 6206}, {'skuId': 38724452, 'sellerPartnerId': 6206}, {'skuId': 38724444, 'sellerPartnerId': 6206}, {'skuId': 38724448, 'sellerPartnerId': 6206}], 'sellers': [{'sellerPartnerId': 6206, 'sellerName': 'Puma Sports India Pvt. 
Ltd.(NSCM)'}], 'sizes': [{'skuId': 38724440, 'styleId': 11203218, 'action': '/product/11203218/related/6?co=1', 'label': '6', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '24.5', 'minValue': '24.5', 'maxValue': '24.5', 'unit': 'cm', 'displayText': '24.5cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '6', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '7', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '39', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 32, 'sellableInventoryCount': 32, 'warehouses': ['106', '328'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724442, 'styleId': 11203218, 'action': '/product/11203218/related/7?co=1', 'label': '7', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '25.4', 'minValue': '25.4', 'maxValue': '25.4', 'unit': 'cm', 'displayText': '25.4cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '7', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '8', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '40.5', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 86, 'sellableInventoryCount': 86, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724444, 'styleId': 11203218, 'action': '/product/11203218/related/8?co=1', 'label': '8', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 
'value': '26.2', 'minValue': '26.2', 'maxValue': '26.2', 'unit': 'cm', 'displayText': '26.2cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '8', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '9', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '42', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 188, 'sellableInventoryCount': 188, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724446, 'styleId': 11203218, 'action': '/product/11203218/related/9?co=1', 'label': '9', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '27.1', 'minValue': '27.1', 'maxValue': '27.1', 'unit': 'cm', 'displayText': '27.1cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '9', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '10', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '43', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 163, 'sellableInventoryCount': 163, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724448, 'styleId': 11203218, 'action': '/product/11203218/related/10?co=1', 'label': '10', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '27.9', 'minValue': '27.9', 'maxValue': '27.9', 'unit': 'cm', 'displayText': '27.9cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '10', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '11', 'size': 'US Size', 
'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '44.5', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 153, 'sellableInventoryCount': 153, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724450, 'styleId': 11203218, 'action': '/product/11203218/related/11?co=1', 'label': '11', 'available': True, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '28.8', 'minValue': '28.8', 'maxValue': '28.8', 'unit': 'cm', 'displayText': '28.8cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '11', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '12', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '46', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': [{'mrp': 6499, 'sellerPartnerId': 6206, 'availableCount': 43, 'sellableInventoryCount': 43, 'warehouses': ['106'], 'supplyType': 'ON_HAND', 'discountId': '11203218:23363948', 'discountedPrice': 2924}]}, {'skuId': 38724452, 'styleId': 11203218, 'action': '/product/11203218/related/12?co=1', 'label': '12', 'available': False, 'sizeType': 'UK Size', 'originalStyle': True, 'measurements': [{'type': 'Body Measurement', 'name': 'To Fit Foot Length', 'value': '29.6', 'minValue': '29.6', 'maxValue': '29.6', 'unit': 'cm', 'displayText': '29.6cm'}], 'allSizesList': [{'scaleCode': 'uk_size', 'sizeValue': '12', 'size': 'UK Size', 'order': 1, 'prefix': 'UK'}, {'scaleCode': 'us_size', 'sizeValue': '13', 'size': 'US Size', 'order': 2, 'prefix': 'US'}, {'scaleCode': 'euro_size', 'sizeValue': '47', 'size': 'Euro Size', 'order': 3, 'prefix': 'EURO'}], 'sizeSellerData': []}], 'discounts': [{'type': 1, 'freeItem': False, 'label': '(55% OFF)', 'discountText': '', 'timerStart': '0', 
'timerEnd': '1597084200', 'discountPercent': 55, 'offer': '', 'discountId': '11203218:23363948', 'heading': None, 'description': None, 'link': None, 'freeItemImage': None}], 'offers': [{'type': 'EMI', 'title': 'EMI option available', 'description': '', 'action': '/faqs', 'image': None}], 'bundledSkus': None, 'richPdp': None, 'landingPageUrl': 'sports-shoes/puma/puma-men-blue-hybrid-fuego-running-shoes/11203218/buy'}, 'pageName': 'Pdp', 'atsa': ['Sport', 'Material', 'Fastening', 'Ankle Height', 'Outsole Type', 'Cleats', 'Pronation for Running Shoes', 'Arch Type', 'Cushioning', 'Running Type', 'Warranty', 'Distance', 'Number of Components', 'Surface Type', 'Technology']}

Related

Filter Boto3 "client.describe_volumes" response

I'm trying to extract data from some EBS volumes using Boto3, specifically: 'Device', 'InstanceId' and 'AvailabilityZone'.
This is my code:
import boto3
AWS_REGION = "us-east-1"
client = boto3.client('ec2', region_name=AWS_REGION)
volume_id_list = ['vol-02e15c9d70exxxxx', 'vol-0bbcb1b0e98xxxxx']

# Describe each volume on its own and print the raw response dictionary.
for vol_id in volume_id_list:
    response = client.describe_volumes(
        VolumeIds=[
            vol_id,
        ],
    )
    print(response)
I'm getting the following response. The data is there, but I'm not able to extract it because of the format:
{'Volumes': [{'Attachments': [{'AttachTime': datetime.datetime(2022, 11, 23, 18, 18, 9, tzinfo=tzutc()), 'Device': '/dev/xvda', 'InstanceId': 'i-xxxxxxxxxxx', 'State': 'attached', 'VolumeId': 'vol-02e15c9xxxxxxx', 'DeleteOnTermination': True}], 'AvailabilityZone': 'us-east-1a', 'CreateTime': datetime.datetime(2022, 11, 23, 18, 18, 9, 713000, tzinfo=tzutc()), 'Encrypted': False, 'Size': 8, 'SnapshotId': 'snap-xxxxxxxxxxx', 'State': 'in-use', 'VolumeId': 'vol-xxxxxxxxxxx', 'Iops': 100, 'VolumeType': 'gp2', 'MultiAttachEnabled': False}], 'ResponseMetadata': {'RequestId': 'xxxxxxx-eb66-4be9-a2c6-0xxxxxxxxx', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'xxxxxx-eb66-4be9-a2c6-xxxxxxxx', 'cache-control': 'no-cache, no-store', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'content-type': 'text/xml;charset=UTF-8', 'content-length': '1201', 'date': 'Wed, 07 Dec 2022 17:07:50 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}}
Is there a way to filter the response so I can only receive 'Device', 'InstanceId' and 'AvailabilityZone'?
Thanks in advance
Sure @John Rotenstein, here is the complete answer:
This is how I got the data I needed:
import boto3
AWS_REGION = "us-east-1"
client = boto3.client('ec2', region_name=AWS_REGION)
volume_id_list = ['vol-02e15c9d70exxxxx', 'vol-0bbcb1b0e98xxxxx']

# For every volume, print the device, instance id and availability zone of
# each attachment (one value per line).
for vol_id in volume_id_list:
    response = client.describe_volumes(
        VolumeIds=[
            vol_id,
        ],
    )
    for volume in response['Volumes']:
        az = volume['AvailabilityZone']
        attachments = volume.get('Attachments', [])
        if not attachments:
            # An unattached volume has no device or instance; indexing
            # Attachments[0] here would raise IndexError.
            print(f"{vol_id} is not attached to any instance")
            print(az)
            continue
        for attachment in attachments:
            device = attachment['Device']
            instance = attachment['InstanceId']
            print(device)
            print(instance)
            print(az)

Scrapy pagination: Unable to paginate

First of all, thank you if you are reading this.
I have been using Python with scrapy to scrape minor data, however, I want to pull in some additional information but I got stuck on pagination.
The website is https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html
The element is
<span class="jslink pg-btn page-next" data-href="https://home.mobile.de/regional/baden-württemberg/2.html" title="Zur nächsten Seite"> </span>
element
What is the XPath expression I can use in Rule(LinkExtractor(restrict_xpaths=""))?
I'm using crawl template.
My code so far:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class Baden1Spider(CrawlSpider):
    """Crawl the Baden-Württemberg dealer listing on home.mobile.de.

    Links found inside the listing rows are followed, and each followed page
    is passed to ``parse_item``, which yields one item of dealer contact data.
    """

    name = 'baden1'
    allowed_domains = ['home.mobile.de']
    start_urls = ['https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html?fbclid=IwAR0MpRTx1TrrrBdg2cKr5E08QiP4fE-pjOAwb7_UsEytToJmWFEfpdD6X0w/']

    rules = (
        # XPath attribute tests use "@" (``@class``); "#class" is not valid XPath.
        Rule(LinkExtractor(restrict_xpaths="//div[@class='box']/div[@class='row ']"), callback='parse_item', follow=True),
        # Rule(LinkExtractor(restrict_xpaths="//span[@class='jslink pg-btn page-next']"))
    )

    def parse_item(self, response):
        """Yield the dealer's name, address, phone numbers and page URL.

        Args:
            response: The downloaded dealer page.

        Yields:
            dict: One item per page. ``ZIP Code`` and ``City`` are ``None``
            when the second address line has fewer words than expected.
        """
        # The text node after the street holds "<zip> <city>"; evaluate the
        # expression once and split it into words.
        zip_and_city = (
            response.xpath("normalize-space(//div[contains(@class, 'addressData')]/text()/following::text()[1])").get()
            or ''
        ).split()
        yield {
            'Dealer Name': response.xpath("//address[@class='fullAddress']/strong/text()").get(),
            'Street': response.xpath("normalize-space(//div[contains(@class, 'addressData')]/text())").get(),
            'ZIP Code': zip_and_city[0] if zip_and_city else None,
            # Only the first word after the ZIP code is kept, as before.
            'City': zip_and_city[1] if len(zip_and_city) > 1 else None,
            'Phone Number 1': response.xpath("normalize-space(//div[contains(@class, 'dealerContactPhoneNumbers')]/text())").get(),
            'Phone Number 2': response.xpath("normalize-space(//div[contains(@class, 'dealerContactPhoneNumbers')]/text()/following::text()[1])").get(),
            'Source': response.url
        }
N.B. This is my first post here in stackoverflow. If I made any mistake, pardon me.
Here is the pagination:
Your code is working fine. The starting URL, "https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html", is the same one you mentioned; it is the URL you land on when you click the first page, and it is where the data comes from. I implemented the pagination in start_urls using a list comprehension, so you can increase or decrease the range of page numbers at any time. Here I scrape only five pages, but you can scrape all the pages, or as many as you wish, by putting the page numbers inside the range. Scraping 5 pages gave me 160 items in total.
CODE:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class Baden1Spider(CrawlSpider):
    """Crawl the first five Baden-Württemberg listing pages on home.mobile.de.

    Pagination is done up front: ``start_urls`` lists pages 0-4 explicitly.
    Links found inside the listing rows are followed, and each followed page
    is passed to ``parse_item``, which yields one item of dealer contact data.
    """

    name = 'baden1'
    allowed_domains = ['home.mobile.de']
    # Widen or narrow the range to scrape more or fewer listing pages.
    start_urls = [f'https://home.mobile.de/regional/baden-w%C3%BCrttemberg/{x}.html' for x in range(0, 5)]

    rules = (
        # XPath attribute tests use "@" (``@class``); "#class" is not valid XPath.
        Rule(LinkExtractor(restrict_xpaths="//div[@class='box']/div[@class='row ']"), callback='parse_item', follow=True),
        # NOTE(review): the next-page element is a <span> carrying data-href,
        # so a LinkExtractor would presumably also need tags=('span',) and
        # attrs=('data-href',) for this rule to follow it - confirm.
        # Rule(LinkExtractor(restrict_xpaths="//span[@class='jslink pg-btn page-next']"))
    )

    def parse_item(self, response):
        """Yield the dealer's name, address, phone numbers and page URL.

        Args:
            response: The downloaded dealer page.

        Yields:
            dict: One item per page. ``ZIP Code`` and ``City`` are ``None``
            when the second address line has fewer words than expected.
        """
        # The text node after the street holds "<zip> <city>"; evaluate the
        # expression once and split it into words.
        zip_and_city = (
            response.xpath("normalize-space(//div[contains(@class, 'addressData')]/text()/following::text()[1])").get()
            or ''
        ).split()
        yield {
            'Dealer Name': response.xpath("//address[@class='fullAddress']/strong/text()").get(),
            'Street': response.xpath("normalize-space(//div[contains(@class, 'addressData')]/text())").get(),
            'ZIP Code': zip_and_city[0] if zip_and_city else None,
            # Only the first word after the ZIP code is kept, as before.
            'City': zip_and_city[1] if len(zip_and_city) > 1 else None,
            'Phone Number 1': response.xpath("normalize-space(//div[contains(@class, 'dealerContactPhoneNumbers')]/text())").get(),
            'Phone Number 2': response.xpath("normalize-space(//div[contains(@class, 'dealerContactPhoneNumbers')]/text()/following::text()[1])").get(),
            'Source': response.url
        }
OUTPUT: A portion of total output.
'Dealer Name': 'Abbas KfZ An- und Verkauf', 'Street': 'schießstattweg 18', 'ZIP Code': '88677', 'City': 'Markdorf', 'Phone Number 1': 'Tel.:\xa0+49 (0)176 56730811', 'Phone Number 2': '', 'Source': 'https://home.mobile.de/ABBASKFZANUNDVERKAUF'}
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/SCHAIBLEMASCHINENHANDEL>
{'Dealer Name': 'Schaible Maschinenhandel', 'Street': 'In Oberwiesen 7', 'ZIP Code': '88682', 'City': 'Salem', 'Phone Number 1': 'Tel.:\xa0+49 (0)7553 60146', 'Phone Number 2': 'Mobiltelefon:\xa0+49 (0)171 7998515', 'Source': 'https://home.mobile.de/SCHAIBLEMASCHINENHANDEL'}
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/RUSH-AUTOMOBILE>
{'Dealer Name': 'RUSH Automobile UG (haftungsbeschränkt)', 'Street': 'Hallendorferstrasse 6', 'ZIP Code': '88690', 'City': 'Uhldingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7551 949277', 'Phone Number 2': '2. Tel.-Nr.:\xa0+49 (0)171 3608800', 'Source': 'https://home.mobile.de/RUSH-AUTOMOBILE'}
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/FIRST-CLASS-AUTOMOBILE> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AH-MUTTER> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/LOCFAHRZEUGE>
{'Dealer Name': 'LOC Fahrzeuge OHG', 'Street': 'Meersburger Straße 2', 'ZIP Code': '88690', 'City': 'Uhldingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7556 928597', 'Phone Number 2': 'Fax:\xa0+49 (0)7556 928583', 'Source': 'https://home.mobile.de/LOCFAHRZEUGE'}
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AH-SCHMID-BERMATINGEN> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/FIRST-CLASS-AUTOMOBILE>
{'Dealer Name': 'First Class Automobile Seit 1989', 'Street': 'Büro: Oberer Höhenweg 29', 'ZIP Code': '88697', 'City': 'Bermatingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)176 20491640', 'Phone Number 2': '2. Tel.-Nr.:\xa0+49 (0)7544 91111', 'Source': 'https://home.mobile.de/FIRST-CLASS-AUTOMOBILE'}
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AH-MUTTER>
{'Dealer Name': 'Autohaus Matthias Mutter', 'Street': 'Salemerstrasse 42', 'ZIP Code': '88697', 'City': 'Bermatingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7544 912100', 'Phone Number 2': 'Fax:\xa0+49 (0)7544 91110', 'Source': 'https://home.mobile.de/AH-MUTTER'}
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AH-SCHMID-BERMATINGEN>
{'Dealer Name': 'Autohaus Schmid', 'Street': 'Salemer Straße 30', 'ZIP Code': '88697', 'City': 'Bermatingen', 'Phone Number 1': 'Tel.:\xa0+49 7544 2375', 'Phone Number 2': 'Fax:\xa0+49 7544 1355', 'Source': 'https://home.mobile.de/AH-SCHMID-BERMATINGEN'}
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/YAMAHA-NESENSOHN> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/YAMAHA-NESENSOHN>
{'Dealer Name': 'Yamaha Nesensohn', 'Street': 'Salemerstrasse 51', 'ZIP Code': '88697', 'City': 'Bermatingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7544 2902', 'Phone Number 2': 'Fax:\xa0+49 (0)7544 73025', 'Source': 'https://home.mobile.de/YAMAHA-NESENSOHN'}
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AUTOHAUS-KIRCHHOFF> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AUTOMOBILEREHM> (referer:
https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AUTOHAUSSAILERGMBHCOKG> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AUTOHAUS-KIRCHHOFF>
{'Dealer Name': 'Autohaus Kirchhoff', 'Street': 'Am Luckengraben 4', 'ZIP Code': '88699', 'City': 'Frickingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7554 8450', 'Phone Number 2': 'Fax:\xa0+49 (0)7554 8252', 'Source': 'https://home.mobile.de/AUTOHAUS-KIRCHHOFF'}
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/PATRICKKAYSERHAGNAUAMBODENSEE1> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/AUTOHAUSREICHLEOHG> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AUTOMOBILEREHM>
{'Dealer Name': 'Automobile Rehm', 'Street': 'Heidbühlstr. 9', 'ZIP Code': '88697', 'City': 'Bermatingen', 'Phone Number 1': 'Tel.:\xa0+49 175 2234111', 'Phone Number 2': '', 'Source': 'https://home.mobile.de/AUTOMOBILEREHM'}
2021-08-06 12:40:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AUTOHAUSSAILERGMBHCOKG>
{'Dealer Name': 'Autohaus Sailer GmbH & Co.KG', 'Street': 'Hofäckerstr. 1', 'ZIP Code': '88697', 'City': 'Bermatingen-Ahausen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7544 968300', 'Phone Number 2': '2. Tel.-Nr.:\xa0+49 (0)7544 9683018', 'Source': 'https://home.mobile.de/AUTOHAUSSAILERGMBHCOKG'}
2021-08-06 12:40:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://home.mobile.de/LACKIERMEISTERBETRIEBKFZSERVICE> (referer: https://home.mobile.de/regional/baden-w%C3%BCrttemberg/0.html)
2021-08-06 12:40:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/PATRICKKAYSERHAGNAUAMBODENSEE1>
{'Dealer Name': 'Patrick Kayser', 'Street': 'Langbrühl 6', 'ZIP Code': '88709', 'City': 'Hagnau', 'Phone Number 1':
'Tel.:\xa0+49 (0)178 6524858', 'Phone Number 2': '2. Tel.-Nr.:\xa0+49 (0)7532 4458081', 'Source': 'https://home.mobile.de/PATRICKKAYSERHAGNAUAMBODENSEE1'}
2021-08-06 12:40:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/AUTOHAUSREICHLEOHG>
{'Dealer Name': 'Autohaus Reichle OHG', 'Street': 'Hauptstraße 57', 'ZIP Code': '88699', 'City': 'Frickingen-Altheim', 'Phone Number 1': 'Tel.:\xa0+49 7554 8337', 'Phone Number 2': 'Mobiltelefon:\xa0+49 151 65828855', 'Source': 'https://home.mobile.de/AUTOHAUSREICHLEOHG'}
2021-08-06 12:40:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://home.mobile.de/LACKIERMEISTERBETRIEBKFZSERVICE>
{'Dealer Name': 'Lackiermeisterbetrieb & KFZ Service', 'Street': 'Lippertsreuterstr. 6b', 'ZIP Code': '88699', 'City': 'Frickingen', 'Phone Number 1': 'Tel.:\xa0+49 (0)7554 9892115', 'Phone Number 2': '2. Tel.-Nr.:\xa0+49 (0)1525 2160629', 'Source': 'https://home.mobile.de/LACKIERMEISTERBETRIEBKFZSERVICE'}
2021-08-06 12:40:15 [scrapy.core.engine] INFO: Closing spider (finished)
2021-08-06 12:40:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 369317,
'downloader/request_count': 165,
'downloader/request_method_count/GET': 165,
'downloader/response_bytes': 2468479,
'downloader/response_count': 165,
'downloader/response_status_count/200': 165,
'elapsed_time_seconds': 17.246198,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 8, 6, 6, 40, 15, 130449),
'httpcompression/response_bytes': 6481573,
'httpcompression/response_count': 165,
'item_scraped_count': 160,

Why doesn't BertForMaskedLM generate right masked tokens?

I am testing this piece of code:
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = BertForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
from transformers import pipeline
def check_model(model, tokenizer):
    """Build a fill-mask pipeline from ``model``/``tokenizer`` and run two prompts.

    Args:
        model: model object handed to the ``fill-mask`` pipeline.
        tokenizer: tokenizer object handed to the same pipeline.
    """
    fill_mask = pipeline(
        "fill-mask",
        model=model,
        tokenizer=tokenizer
    )
    # NOTE: kept exactly as asked in the question. These prompts are plain
    # strings, not f-strings, so the "{nlp.tokenizer.mask_token}" text is
    # passed to the pipeline literally instead of being interpolated.
    print('Fill blank: ')
    fill_mask("我喜欢 {nlp.tokenizer.mask_token}.")
    print('Fill blank: ')
    fill_mask("这个品牌的面膜 {nlp.tokenizer.mask_token}.")
print('Check model ...')
check_model(model, tokenizer)
But it prints out this error message:
Traceback (most recent call last):
File "/Users/congminmin/nlp/embedding/transformer/bert_roberta_wwm_test.py", line 21, in <module>
check_model(model, tokenizer)
File "/Users/congminmin/nlp/embedding/transformer/bert_roberta_wwm_test.py", line 15, in check_model
fill_mask("我喜欢 {nlp.tokenizer.mask_token}.")
File "/Users/congminmin/.venv/wbkg/lib/python3.7/site-packages/transformers/pipelines/fill_mask.py", line 162, in __call__
self.ensure_exactly_one_mask_token(masked_index.numpy())
File "/Users/congminmin/.venv/wbkg/lib/python3.7/site-packages/transformers/pipelines/fill_mask.py", line 90, in ensure_exactly_one_mask_token
f"No mask_token ({self.tokenizer.mask_token}) found on the input",
transformers.pipelines.base.PipelineException: No mask_token ([MASK]) found on the input
That is a string formatting issue. Currently when you call:
"这个品牌的面膜 {nlp.tokenizer.mask_token}."
the string you create is:
'这个品牌的面膜 {nlp.tokenizer.mask_token}.'
What you actually want is a formatted string literal (f-string):
f"我喜欢 {fill_mask.tokenizer.mask_token}."
Output:
'我喜欢 [MASK].'
Full example:
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = BertForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
from transformers import pipeline
def check_model(model, tokenizer):
    """Run two sample fill-mask prompts and print the pipeline's predictions.

    Args:
        model: model object handed to the ``fill-mask`` pipeline.
        tokenizer: tokenizer object handed to the same pipeline.
    """
    fill_mask = pipeline(
        "fill-mask",
        model=model,
        tokenizer=tokenizer
    )
    # The f-string inserts the pipeline tokenizer's own mask token into the
    # prompt, so the input contains the mask the pipeline looks for.
    print('Fill blank: ')
    print(fill_mask(f"我喜欢 {fill_mask.tokenizer.mask_token}."))
    print('Fill blank: ')
    print(fill_mask(f"这个品牌的面膜 {fill_mask.tokenizer.mask_token}."))
print('Check model ...')
check_model(model, tokenizer)
Output:
Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Check model ...
Fill blank:
[{'sequence': '我 喜 欢 他.', 'score': 0.20969171822071075, 'token': 800, 'token_str': '他'}, {'sequence': '我 喜 欢 你.', 'score': 0.2071659415960312, 'token': 872, 'token_str': '你'}, {'sequence': '我 喜 欢 她.', 'score': 0.13876770436763763, 'token': 1961, 'token_str': '她'}, {'sequence': '我 喜 欢 的.', 'score': 0.07542475312948227, 'token': 4638, 'token_str': '的'}, {'sequence': '我 喜 欢 它.', 'score': 0.05587303638458252, 'token': 2124, 'token_str': '它'}]
Fill blank:
[{'sequence': '这 个 品 牌 的 面 膜 好.', 'score': 0.15848451852798462, 'token': 1962, 'token_str': '好'}, {'sequence': '这 个 品 牌 的 面 膜..', 'score': 0.12413082271814346, 'token': 119, 'token_str': '.'}, {'sequence': '这 个 品 牌 的 面 膜 呢.', 'score': 0.09926403313875198, 'token': 1450, 'token_str': '呢'}, {'sequence': '这 个 品 牌 的 面 膜 啊.', 'score': 0.06865812838077545, 'token': 1557, 'token_str': '啊'}, {'sequence': '这 个 品 牌 的 面 膜 1.', 'score': 0.061997584998607635, 'token': 122, 'token_str': '1'}]

How to print specific list in python from a large collection of list? [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 years ago.
Improve this question
['90', '80', '70', '60', '50', '40', '30', '20', '10']
['09', '08', '07', '06', '05', '04', '03', '02', '01']
['11', '12', '13', '14', '15', '16', '17', '18', '19']
['29', '28', '27', '26', '25', '24', '23', '22', '21']
['31', '32', '33', '34', '35', '36', '37', '38', '39']
['49', '48', '47', '46', '45', '44', '43', '42', '41']
['51', '52', '53', '54', '55', '56', '57', '58', '59']
['69', '68', '67', '66', '65', '64', '63', '62', '61']
['71', '72', '73', '74', '75', '76', '77', '78', '79']
How do I print only the first list? And
How do I print only the second and fifth lists?
# The nine rows from the question, stored as a list of lists.
lists = [
    ['90', '80', '70', '60', '50', '40', '30', '20', '10'],
    ['09', '08', '07', '06', '05', '04', '03', '02', '01'],
    ['11', '12', '13', '14', '15', '16', '17', '18', '19'],
    ['29', '28', '27', '26', '25', '24', '23', '22', '21'],
    ['31', '32', '33', '34', '35', '36', '37', '38', '39'],
    ['49', '48', '47', '46', '45', '44', '43', '42', '41'],
    ['51', '52', '53', '54', '55', '56', '57', '58', '59'],
    ['69', '68', '67', '66', '65', '64', '63', '62', '61'],
    ['71', '72', '73', '74', '75', '76', '77', '78', '79'],
]
# Index 0 is the first inner list and index 8 the last one. Using sep="\n"
# puts each list on its own line; passing "\n" as an argument instead would
# surround the newline with the default space separator.
print(lists[0], lists[8], sep="\n")
This prints the first list (index 0) and the last list (index 8).
# Indices are zero-based: 0, 1 and 4 select the 1st, 2nd and 5th lists.
# sep="\n" prints one list per line without stray spaces around the newline.
print(lists[0], lists[1], lists[4], sep="\n")
And this one prints the 1st, 2nd and 5th lists, which answers your question.
If what you have is a list of lists, you access the inner lists in exactly the same way you access any element in a list.
Let's say you have a variable x:
[[1, 2, 3], [4, 5], [6, 7, 8, 9]]
The first list in that variable is x[0], [1, 2, 3].
The first and third lists can be gotten (into another list of lists) with [x[0], x[2]].
By way of comparison, getting the second item from the third list in that original list of lists (the 7) can be done with x[2][1].
# somefile.txt
['90', '80', '70', '60', '50', '40', '30', '20', '10']
['09', '08', '07', '06', '05', '04', '03', '02', '01']
['11', '12', '13', '14', '15', '16', '17', '18', '19']
['29', '28', '27', '26', '25', '24', '23', '22', '21']
['31', '32', '33', '34', '35', '36', '37', '38', '39']
['49', '48', '47', '46', '45', '44', '43', '42', '41']
['51', '52', '53', '54', '55', '56', '57', '58', '59']
['69', '68', '67', '66', '65', '64', '63', '62', '61']
['71', '72', '73', '74', '75', '76', '77', '78', '79']
# main.py
# Read every line of somefile.txt. The trailing newline is stripped so the
# printed output has no blank lines. Each entry is still the raw text of the
# line (a string), not a parsed Python list.
# The name `rows` is used instead of `list` to avoid shadowing the built-in.
with open('somefile.txt', 'r') as f:
    rows = [line.rstrip('\n') for line in f]

# Indices are zero-based: 0 -> 1st, 1 -> 2nd, 4 -> 5th line of the file.
print("first: ", rows[0])
print("second: ", rows[1])
print("fifth: ", rows[4])
This prints the 1st, 2nd and 5th lists.

Dask groupby unique as a frame - how to?

I have a few dataframes:
import pandas as pd
import numpy as np
# Sample router data: one row per (time, cust_id) with an error count.
_ROUTER_COLUMNS = ['time', 'cust_id', 'errors']
_router_rows = [
    ('2018-01-01 00:00:00', '1', 5),
    ('2018-01-01 00:30:00', '1', 7),
    ('2018-01-01 01:00:00', '1', 25),
    ('2018-01-01 01:30:00', '1', 3),
    ('2018-01-01 00:00:00', '2', 25),
    ('2018-01-01 00:30:00', '2', 7),
    ('2018-01-01 01:00:00', '2', 25),
    ('2018-01-01 01:30:00', '2', 35),
]
router = pd.DataFrame(_router_rows, columns=_ROUTER_COLUMNS)
router

# Sample device data: one row per device seen for a (time, cust_id) pair.
_DEVICE_COLUMNS = ['time', 'cust_id', 'device_id']
_device_rows = [
    ('2018-01-01 00:00:00', '1', 'dev_1'),
    ('2018-01-01 00:30:00', '1', 'dev_1'),
    ('2018-01-01 00:30:00', '1', 'dev_2'),
    ('2018-01-01 01:00:00', '1', 'dev_1'),
    ('2018-01-01 01:00:00', '1', 'dev_2'),
    ('2018-01-01 01:00:00', '1', 'dev_3'),
    ('2018-01-01 01:30:00', '1', 'dev_2'),
    ('2018-01-01 00:00:00', '2', 'dev_1'),
    ('2018-01-01 00:00:00', '2', 'dev_2'),
    ('2018-01-01 00:30:00', '2', 'dev_1'),
    ('2018-01-01 01:00:00', '2', 'dev_2'),
    ('2018-01-01 01:00:00', '2', 'dev_3'),
    ('2018-01-01 01:30:00', '2', 'dev_2'),
    ('2018-01-01 01:30:00', '2', 'dev_4'),
]
devices = pd.DataFrame(_device_rows, columns=_DEVICE_COLUMNS)
devices
By using pandas, I can group by and calculate unique devices:
# Group the device rows by customer and timestamp, then collect the distinct
# device ids of each group into a one-column frame.
_device_ids_by_group = devices.groupby(['cust_id', 'time'])['device_id']
devices_per_time = _device_ids_by_group.unique().to_frame()
devices_per_time
I tried to do the same with dask:
I have the following questions:
Why can't I use devices.groupby(['cust_id', 'time'])['device_id'].unique()?
I managed to get the result, but I am not sure whether my approach is optimal. Can someone confirm that I am using dask properly?
Regards.
You cannot call .unique() here because it is not yet implemented for dask's grouped series. Check the list of available functions: SeriesGroupBy
Here's another way of getting the result using parallel apply and set:
# Select the device_id column grouped by (time, cust_id).
_ids_by_group = devices.groupby(['time', 'cust_id'])['device_id']
# Turning each group into a set drops duplicate device ids.
# NOTE(review): meta=object presumably declares the output type to dask - confirm.
_unique_sets = _ids_by_group.apply(set, meta=object)
# Convert each set to a list.
_unique_lists = _unique_sets.apply(list, meta=object)
# Evaluate the result and move the group keys from the index into columns.
_unique_lists.compute().reset_index()
If you don't care whether the final values are sets or lists, you can remove the .apply(list,meta=object) step.

Resources