I'm using Elasticsearch to save my Scrapy data, but when I run my code I get this error:
raise SerializationError(data, e)
elasticsearch.exceptions.SerializationError: ({'real_estate_ID': [],
It works with the other items; I only get a problem with the item real_estate_ID.
from __future__ import absolute_import
import scrapy
from adds.items import AddsItem
import stomp
from elasticsearch import Elasticsearch, helpers

class addsSpider(scrapy.Spider):
    name = "adds"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }
    allowed_domains = ["www.seloger.com"]
    start_urls = ['https://www.seloger.com/list.htm?tri=initial&idtypebien=2,1&idtt=2,5&naturebien=1,2,4&ci=750115']

    es = Elasticsearch('localhost:9200', use_ssl=False, verify_certs=True)

    def parse(self, response):
        es = Elasticsearch()
        es.indices.create(index="first_index", ignore=400)

        conn = stomp.Connection()
        conn.start()
        conn.connect('admin', 'password', wait=True)

        items = AddsItem()
        items['real_estate_ID'] = response.xpath('//div[@class="c-pa-list c-pa-sl c-pa-gold cartouche "]//@id').extract()
        items['real_estate_URL'] = response.xpath('//a[@class="c-pa-link link_AB"]//@href').extract()
        items['real_estate_sale_price'] = response.xpath('//div[@class="h-fi-pulse annonce__detail__sauvegarde"]//@data-prix').extract()
        items['real_estate_category'] = response.xpath('//a[@class="c-pa-link link_AB"]//@title').extract()

        for item in items['real_estate_URL']:
            conn.send(body=item, destination='/queue/scrapy.seloger.ads.queue', persistent='false')

        yield items

        nextpageurl = response.xpath('//a[@class="pagination-next"]/@href')
        if nextpageurl:
            # If we've found a pattern which matches
            path = nextpageurl.extract_first()
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))  # Write a debug statement
            yield scrapy.Request(nextpage, callback=self.parse)

        es.index(index="urls", doc_type="Ads_url", id=1, body=items)
        res = es.get(index="urls", doc_type="Ads_url", id=1)
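For reference, elasticsearch-py's JSON serializer only handles plain Python types, so passing the Scrapy Item object directly to es.index() can raise SerializationError with exactly this kind of dict-looking payload. A minimal sketch of one workaround, not a confirmed fix: convert the item to an ordinary dict before indexing, reusing the names from the spider above.

        # convert the Scrapy Item to a plain dict so the JSON serializer can handle it
        doc = dict(items)
        es.index(index="urls", doc_type="Ads_url", id=1, body=doc)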
The problem is that the last four lines never get executed because the function above runs an infinite loop, and I can't do without that loop. Can anyone suggest a solution?
from aiogram import Bot, types
from aiogram.dispatcher import Dispatcher
from aiogram.utils import executor
from requests import Request, Session
import requests
import json
import pprint
import time

bot = Bot(token='')
dp = Dispatcher(bot)

site = "https://pro-api.coinmarketcap.com/v2/cryptocurrency/quotes/latest"
parameters = {
    'slug': 'bitcoin',
    'convert': 'USD'
}
headers = {
    'Accepts': 'application/json',
    'X-CMC_PRO_API_KEY': ''
}

def main():
    bitcoin_history = []
    while True:
        session = Session()
        session.headers.update(headers)
        response = session.get(site, params=parameters)
        price = (json.loads(response.text)['data']['1']['quote']['USD']['price'])
        bitcoin_history.append(price)
        print(bitcoin_history)
        if len(bitcoin_history) == 5:
            bitcoin_history = []
        time.sleep(30)

main()

@dp.message_handler(commands=["btcusd"])
async def echo_send(message: types.Message):
    await message.answer("$" + str())

executor.start_polling(dp, skip_updates=True)
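A minimal sketch of one way around this (an editorial suggestion, not from the question): run the polling loop in a daemon thread and share the latest price through a module-level variable, so executor.start_polling() is free to run. It reuses site, parameters, headers, dp, types and executor from the snippet above; the variable and function names are made up.

import threading

latest_price = None  # updated by the background poller, read by the bot handler

def poll_prices():
    global latest_price
    session = Session()
    session.headers.update(headers)
    while True:
        response = session.get(site, params=parameters)
        latest_price = json.loads(response.text)['data']['1']['quote']['USD']['price']
        time.sleep(30)

@dp.message_handler(commands=["btcusd"])
async def send_price(message: types.Message):
    await message.answer("$" + str(latest_price))

if __name__ == "__main__":
    threading.Thread(target=poll_prices, daemon=True).start()  # the infinite loop no longer blocks
    executor.start_polling(dp, skip_updates=True)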
I am using Python 3.8.5 and Scrapy 2.4.0, together with scrapy-proxy-pool and scrapy-user-agents, and I am getting "AttributeError: Response content isn't text". I am running this code inside a Python 3 venv. Could you help me explain and solve the problem?
Here is my code:
import scrapy
import json

class BasisMembersSpider(scrapy.Spider):
    name = 'basis'
    allowed_domains = ['www.basis.org.bd']

    def start_requests(self):
        start_url = 'https://basis.org.bd/get-member-list?page=1&team='
        yield scrapy.Request(url=start_url, callback=self.get_membership_no)

    def get_membership_no(self, response):
        data_array = json.loads(response.body)['data']
        next_page = json.loads(response.body)['links']['next']
        for data in data_array:
            next_url = 'https://basis.org.bd/get-company-profile/{0}'.format(data['membership_no'])
            yield scrapy.Request(url=next_url, callback=self.parse)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.get_membership_no)

    def parse(self, response):
        print("Printing informations....................................................")
Here is my settings.py file:
BOT_NAME = 'web_scraping'
SPIDER_MODULES = ['web_scraping.spiders']
NEWSPIDER_MODULE = 'web_scraping.spiders'
AUTOTHROTTLE_ENABLED = True
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'web_scraping (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
PROXY_POOL_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 800,
'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
And here are the error messages from the console output:
Thank you...
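For context, "Response content isn't text" is raised when something accesses response.text on a non-text response; with a proxy pool that is often a broken proxy returning an empty or binary body. One possible guard is a custom ban-detection policy that treats such responses as bans. This is only a sketch: the PROXY_POOL_BAN_POLICY setting and the method names below mirror scrapy-rotating-proxies and are an assumption about scrapy-proxy-pool, so please check its README.

# web_scraping/policy.py (hypothetical module)
from scrapy.http import TextResponse

class SafeBanPolicy:
    """Treat non-text responses from proxies as bans instead of letting response.text blow up."""

    def response_is_ban(self, request, response):
        if not isinstance(response, TextResponse):
            return True  # binary/empty body -> retry through another proxy
        return response.status in (403, 429) or not len(response.body)

    def exception_is_ban(self, request, exception):
        return None

# settings.py
PROXY_POOL_BAN_POLICY = 'web_scraping.policy.SafeBanPolicy'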
I am trying to read data from a Kafka topic that has been serialized using Google's protobuf.
I compiled the proto files with protoc, which generated the pb2 files.
Now I am trying to use Faust to create a stream processor, but I can't find the correct way to use the pb2 classes as key_serializer and value_serializer.
Here is what I have tried:
import faust

from proto.topic_pb2 import topic

app = faust.App(
    'faust-consumer',
    broker='kafka://',
    store="memory://",
    cache="memory://",
)

schema = faust.Schema(
    ## key_type=topic.PK,
    ## value_type=topic,
    key_serializer=topic.PK,
    value_serializer=topic,
)

topic = app.topic(
    'topic',
    schema=schema
)

@app.agent(topic)
async def consume(topic):
    async for event in topic:
        print(event)

if __name__ == "__main__":
    app.main()
Does anybody have any idea how to use the pb2 files in the serializers?
Man, I was trying to do the same thing this past week. After struggling, I finally got something working - not the best way, but it works well enough.
Initially I used this Python compiler, https://github.com/danielgtaylor/python-betterproto, to generate the *.py files with dataclasses / type hinting.
Then I was able to create faust.Record classes dynamically by using a helper:
import abc
import inspect
from typing import Type

import betterproto
import faust

GENERATED_SUFFIX = "__FaustRecord_Auto"

def _import_relative_class(module: str, klass_name: str):
    resolved_import = __import__(module, fromlist=[klass_name])
    klass = getattr(resolved_import, klass_name)
    return klass

def _is_record(attype: Type):
    return (
        inspect.isclass(attype)
        and isinstance(attype, betterproto.Message)
        or isinstance(attype, abc.ABCMeta)
    )

def _build_record_annotations(klass: Type):
    annotations = {}
    for atname, attype in klass.__annotations__.items():
        if _is_record(attype):
            annotations[atname] = make_faust_record(attype)
        elif isinstance(attype, str):
            subklass = _import_relative_class(klass.__module__, attype)
            annotations[atname] = make_faust_record(subklass)
        else:
            annotations[atname] = attype
    return annotations

def make_faust_record(klass: Type):
    type_name = f"{klass.__name__}{GENERATED_SUFFIX}"
    record_type = type(type_name, (faust.Record, klass), {})
    record_type.__annotations__ = _build_record_annotations(klass)
    record_type._init_subclass()
    return record_type
Now you can use it like:
import faust

from proto.your_models import YourModel  # Import your generated proto here
from faust_converter import make_faust_record

app = faust.App(
    'faust-consumer',
    broker='kafka://',
    store="memory://",
    cache="memory://",
)

model_record = make_faust_record(YourModel)

topic = app.topic(
    'topic',
    value_type=model_record
)

@app.agent(topic)
async def consume(topic):
    async for event in topic:
        print(event)

if __name__ == "__main__":
    app.main()
I was also experimenting with using Protobuf with Faust.
Mentioned below is the solution using Faust Serialiser Codecs.
faust-protobuf https://github.com/hemantkashniyal/faust-protobuf
proto_serializer.py
from faust.serializers import codecs
from typing import Any
from google.protobuf import json_format
from google.protobuf.json_format import MessageToJson
from google.protobuf.json_format import MessageToDict
from google.protobuf import text_format
from google.protobuf.text_format import MessageToString
from google.protobuf.text_format import MessageToBytes

class ProtobufSerializer(codecs.Codec):
    def __init__(self, pb_type: Any):
        self.pb_type = pb_type
        super(self.__class__, self).__init__()

    def _dumps(self, pb: Any) -> bytes:
        return pb.SerializeToString()

    def _loads(self, s: bytes) -> Any:
        pb = self.pb_type()
        pb.ParseFromString(s)
        return pb
app.py
import faust
from google.protobuf.json_format import MessageToJson

from .proto.greetings_pb2 import Greeting
from .proto_serializer import ProtobufSerializer

app = faust.App(
    'faust-consumer',
    broker='kafka://',  # TODO: update kafka endpoint
    store="memory://",
    cache="memory://",
)

greetings_schema = faust.Schema(
    key_serializer=ProtobufSerializer(pb_type=Greeting),
    value_serializer=ProtobufSerializer(pb_type=Greeting),
)

topic = app.topic(
    'greetings',
    schema=greetings_schema
)

@app.agent(topic)
async def consume(topic):
    async for event in topic:
        print(MessageToJson(event))

@app.timer(5)
async def produce():
    for i in range(10):
        data = Greeting(hello="world", message=i)
        await consume.send(value=data)

if __name__ == "__main__":
    app.main()
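As a small follow-up (not part of the original answer): Faust also lets you register a codec globally by name through faust.serializers.codecs.register, so the protobuf serializer can then be referenced as a string wherever a serializer name is expected. A sketch, assuming the ProtobufSerializer and Greeting defined above:

from faust.serializers import codecs

# register the codec under a name once at startup
codecs.register('greeting_proto', ProtobufSerializer(pb_type=Greeting))

greetings_topic = app.topic(
    'greetings',
    key_serializer='greeting_proto',
    value_serializer='greeting_proto',
)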
I was able to do it by creating a Serializer class like so:
import logging

import faust
from abc import ABCMeta, abstractmethod
from google.protobuf.json_format import MessageToDict
from faust.serializers.codecs import Codec
from importlib import import_module

logger = logging.getLogger(__name__)  # standard logger used for the debug message below

def get_proto(topic_name, only_pk=False):
    if not hasattr(get_proto, "topics"):
        setattr(get_proto, "topics", dict())
    get_proto.topics[topic_name] = import_module(
        "protodef.{}_pb2".format(topic_name)
    ).__getattribute__(topic_name.split(".")[-1])
    if only_pk:
        return getattr(get_proto, "topics").get(topic_name).PK
    else:
        return getattr(get_proto, "topics").get(topic_name)

class ProtoSerializer(Codec, metaclass=ABCMeta):
    @abstractmethod
    def only_key(self):
        ...

    def as_proto(self, topic_name):
        self._proto = get_proto(topic_name, self.only_key())
        return self

    def _loads(self, b):
        data = MessageToDict(
            self._proto.FromString(b),
            preserving_proto_field_name=True,
            including_default_value_fields=True,
        )
        # remove the key object from the unserialized message
        data.pop("key", None)
        return data

    def _dumps(self, o):
        # for deletes
        if not o:
            return None
        obj = self._proto()
        # add the key object to the message before serializing
        if hasattr(obj, "PK"):
            for k in obj.PK.DESCRIPTOR.fields_by_name.keys():
                if k not in o:
                    raise Exception(
                        "Invalid object `{}` for proto `{}`".format(o, self._proto)
                    )
                setattr(obj.key, k, o[k])
        for k, v in o.items():
            if hasattr(obj, k):
                setattr(obj, k, v)
            else:
                logger.debug(
                    "Invalid value-attribute `%s` for proto `%s`", k, self._proto
                )
        return obj.SerializeToString()

class ProtoValue(ProtoSerializer):
    def only_key(self):
        return False

class ProtoKey(ProtoSerializer):
    def only_key(self):
        return True
and then use it as follows:
import faust

from utils.serializer import ProtoKey, ProtoValue

app = faust.App(
    'faust-consumer',
    broker='kafka://',
    store="memory://",
    cache="memory://",
)

topic = app.topic(
    'topic',
    key_serializer=ProtoKey().as_proto('topic'),
    value_serializer=ProtoValue().as_proto('topic')
)

@app.agent(topic)
async def consume(topic):
    async for event in topic:
        print(event)

if __name__ == "__main__":
    app.main()
I am creating API endpoints using flask-sqlalchemy and marshmallow in Python. For example, I have two tables: items and stores. An item has attributes like item_id, item_name, item_price and a list of available stores (store_list). A store has attributes like store_id, store_name, store_location and a list of available items (items_list). I require the following JSON response when I request the list of items:
[
    {
        item_id: 1,
        item_name: "Laptop",
        item_price: "20",
        store_list: [
            {
                store_id: 1,
                store_name: "ABC",
                store_location: "USA"
            },
            {
                store_id: 2,
                store_name: "BBC",
                store_location: "USA"
            },
            {
                store_id: 3,
                store_name: "CBC",
                store_location: "USA"
            }
        ]
    },
    {
        item_id: 2,
        item_name: "Laptop",
        item_price: "20",
        store_list: [
            {
                store_id: 1,
                store_name: "ABC",
                store_location: "USA"
            },
            {
                store_id: 2,
                store_name: "BBC",
                store_location: "USA"
            },
            {
                store_id: 3,
                store_name: "CBC",
                store_location: "USA"
            }
        ]
    }
    ......... and so on
]
I require the following JSON response when I request the list of stores:
[
    {
        store_id: 1,
        store_name: "ABC",
        store_location: "USA",
        items_list: [
            {
                items_id: 1,
                items_name: "Laptop",
                items_price: "65"
            },
            {
                items_id: 2,
                items_name: "Keyboard",
                items_price: "56"
            },
            {
                items_id: 3,
                items_name: "Mouse",
                items_price: "56"
            }
        ]
    },
    {
        store_id: 2,
        store_name: "BBC",
        store_location: "UK",
        items_list: [
            {
                items_id: 1,
                items_name: "Laptop",
                items_price: "23"
            },
            {
                items_id: 2,
                items_name: "BBC",
                items_price: "Speaker"
            },
            {
                items_id: 3,
                items_name: "Mouse",
                items_price: "24"
            }
        ]
    }
    ......... and so on
]
So far I have tried the following
ITEMS MODEL
from requests import Response
from flask import request, url_for
from datetime import datetime
from typing import List

from db import db
from models.store import Stores

# Bartiny Ingredients Generic Types Model
class Items(db.Model):
    __tablename__ = "items"

    item_id = db.Column(db.Integer, primary_key=True)
    item_name = db.Column(db.String(100), nullable=False)
    item_price = db.Column(db.String(10), nullable=False)
    store_list = db.relationship('Stores', backref=db.backref('items'))

    @classmethod
    def find_by_name(cls, name: str) -> "Items":
        return cls.query.filter_by(item_name=name).first()

    @classmethod
    def find_by_id(cls, _id: int) -> "Items":
        return cls.query.filter_by(item_id=_id).first()

    @classmethod
    def find_all(cls) -> List["Items"]:
        return cls.query.all()

    def save_to_db(self) -> None:
        db.session.add(self)
        db.session.commit()

    def delete_from_db(self) -> None:
        db.session.delete(self)
        db.session.commit()
STORE MODEL
from requests import Response
from flask import request, url_for
from datetime import datetime
from typing import List

from db import db
from models.items import Items

# Bartiny Ingredients Generic Types Model
class Stores(db.Model):
    __tablename__ = "stores"

    store_id = db.Column(db.Integer, primary_key=True)
    store_name = db.Column(db.String(100), nullable=False)
    store_locations = db.Column(db.String(10), nullable=False)
    items_list = db.relationship('Items', backref=db.backref('stores'))

    @classmethod
    def find_by_name(cls, name: str) -> "Stores":
        return cls.query.filter_by(store_name=name).first()

    @classmethod
    def find_by_id(cls, _id: int) -> "Stores":
        return cls.query.filter_by(store_id=_id).first()

    @classmethod
    def find_all(cls) -> List["Stores"]:
        return cls.query.all()

    def save_to_db(self) -> None:
        db.session.add(self)
        db.session.commit()

    def delete_from_db(self) -> None:
        db.session.delete(self)
        db.session.commit()
SCHEMAS
# Items Schema
from ma import ma
from marshmallow import pre_dump
from models.item import Items

class ItemsSchema(ma.ModelSchema):
    class Meta:
        model = Items

# Store Schema
from ma import ma
from marshmallow import pre_dump
from models.store import Stores

class StoresSchema(ma.ModelSchema):
    class Meta:
        model = Stores
Resources
# Store Resource
from flask_restful import Resource

from models.store import Stores
from schemas.store import StoresSchema

store_list_schema = StoresSchema(many=True)

class StoreList(Resource):
    @classmethod
    def get(cls):
        return {"stores": store_list_schema.dump(Stores.find_all())}, 200

# Items Resource
from flask_restful import Resource

from models.item import Items
from schemas.item import ItemsSchema

item_list_schema = ItemsSchema(many=True)

class ItemList(Resource):
    @classmethod
    def get(cls):
        return {"items": item_list_schema.dump(Items.find_all())}, 200
The following is the code that starts the app:
from flask import Flask, jsonify
from flask_restful import Api
from flask_jwt_extended import JWTManager  # assumption: JWTManager comes from flask-jwt-extended
from marshmallow import ValidationError

from db import db
from ma import ma
from resources.item import Item, ItemList
from resources.store import Store, StoreList

app = Flask(__name__)
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///data.db"
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
api = Api(app)

@app.before_first_request
def create_tables():
    db.create_all()

@app.errorhandler(ValidationError)
def handle_marshmallow_validation(err):
    return jsonify(err.messages), 400

jwt = JWTManager(app)

api.add_resource(StoreList, "/stores")
api.add_resource(ItemList, "/items")

if __name__ == "__main__":
    db.init_app(app)
    ma.init_app(app)
    app.run(port=5000, debug=True)
Looks like jsonify is your friend...
https://www.google.com/search?q=flask+sqlalchemy+to+json
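To actually get the nested shape shown above, the usual flask-marshmallow approach is to declare the relationship field as a Nested schema and to back the item/store relationship with a many-to-many association table. A minimal sketch under those assumptions (the stores_items table, the fields lists, and a store_location column are illustrative, not the question's actual code):

# models (sketch): many-to-many needs an association table plus secondary=
stores_items = db.Table(
    "stores_items",
    db.Column("item_id", db.Integer, db.ForeignKey("items.item_id"), primary_key=True),
    db.Column("store_id", db.Integer, db.ForeignKey("stores.store_id"), primary_key=True),
)

class Items(db.Model):
    __tablename__ = "items"
    item_id = db.Column(db.Integer, primary_key=True)
    item_name = db.Column(db.String(100), nullable=False)
    item_price = db.Column(db.String(10), nullable=False)
    # backref gives every Stores instance an items_list attribute automatically
    store_list = db.relationship("Stores", secondary=stores_items, backref="items_list")

# schemas (sketch): nest the related schema so dump() emits the embedded objects
class StoresSchema(ma.ModelSchema):
    class Meta:
        model = Stores
        fields = ("store_id", "store_name", "store_location")

class ItemsSchema(ma.ModelSchema):
    store_list = ma.Nested(StoresSchema, many=True)

    class Meta:
        model = Items
        fields = ("item_id", "item_name", "item_price", "store_list")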
I'm new to Scrapy and Python, and I would like to know the best way to group prices and sellers.
My code:
import scrapy
import re

class ProductSpider(scrapy.Spider):
    name = 'product'
    start_urls = ['https://www.google.nl/shopping/product/13481271230046931540?client=opera&biw=1880&bih=1008&output=search&q=738678181690&oq=738678181690&prds=hsec:online,paur:ClkAsKraX-dNyyNw1dQNKCoMJnN5PTCcIkKQRK_FTu38WNkh-ASATIAsHY2pgV1a1wnYh_rnqfve8FuuCsHc8boLnMjv9EO2Q4wJS_AvrOL1pcn-GYMYHecz7BIZAFPVH73OGQwGCep7aILTJXavWpXt0Ij80g&sa=X&ved=0ahUKEwjmtKKLp5HaAhWHchQKHbO5Dg8Q2SsIFQ']
    all_seller = []

    def parse(self, response):
        self.log('Bla Bla Bla:' + response.url)
        for product in response.css(".os-main-table"):
            item = {
                "all_sellers": product.css(".os-seller-name-primary > a::text").extract(),
                "all_prices": product.css("td.os-total-col::text").re("\d+\,\d{1,2}"),
            }
        for item in zip(all_prices, all_sellers):
            scrapped_info = {
                'all_sellers': item[0],
                'all_prices': item[1],
            }
            yield scrapped_info

        next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Solution:
import scrapy
import re

class ProductSpider(scrapy.Spider):
    name = 'product'
    start_urls = ['https://www.google.nl/shopping/product/13481271230046931540?client=opera&biw=1880&bih=1008&output=search&q=738678181690&oq=738678181690&prds=hsec:online,paur:ClkAsKraX-dNyyNw1dQNKCoMJnN5PTCcIkKQRK_FTu38WNkh-ASATIAsHY2pgV1a1wnYh_rnqfve8FuuCsHc8boLnMjv9EO2Q4wJS_AvrOL1pcn-GYMYHecz7BIZAFPVH73OGQwGCep7aILTJXavWpXt0Ij80g&sa=X&ved=0ahUKEwjmtKKLp5HaAhWHchQKHbO5Dg8Q2SsIFQ']

    def parse(self, response):
        all_sellers = response.css(".os-seller-name-primary > a::text").extract()
        all_prices = response.css("td.os-total-col::text").re("\d+\,\d{1,2}")
        for seller, price in zip(all_sellers, all_prices):
            scrapped_info = {
                'all_sellers': seller,
                'all_prices': price,
            }
            yield scrapped_info

        next_page_url = response.css('.pag-prev-next-links > a:last-child::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
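One caveat with this pairing (an observation, not part of the original solution): zip() silently drops any extra entries when the two lists differ in length, so a seller without a price shifts the remaining pairs out of alignment. If that can happen on the page, itertools.zip_longest at least makes a count mismatch visible:

from itertools import zip_longest

# inside parse(), replacing the zip() loop: missing values become None instead of being dropped
for seller, price in zip_longest(all_sellers, all_prices):
    yield {'all_sellers': seller, 'all_prices': price}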