Scrapy pipeline mysql connection module error - python-3.x

I am unable to get Scrapy to write to my local database through my pipeline. I have already installed mysql-connector-python 8.0.19 and am able to write data to the database within the same project, just not from a Scrapy pipeline. Can someone please help? I can't figure out why it isn't working.
When I try to send data via the Scrapy pipeline I get the following error:
[twisted] CRITICAL: Unhandled error in Deferred:
File "C:\Users\Viking\PycharmProjects\Indigo_Scrp\IndgoScrp\IndgoScrp\pipelines.py", line 7, in <module>
from mysql.connector import (connection)
ModuleNotFoundError: No module named 'mysql'
Here is my code for the pipeline:
from mysql.connector import (connection)
from mysql.connector import errorcode


class IndgoscrpPipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = connection.MySQLConnection(
            host='127.0.0.1',
            user='root',
            passwd='',
            database='Python'
        )
        self.curr = self.conn.cursor()

    def open_spider(self, spider):
        print("spider open")

    def process_item(self, item, spider):
        print("Saving item into db ...")
        self.save(dict(item))
        return item

    def close_spider(self, spider):
        self.mysql_close()

    ##########################################################################
    def mysql_connect(self):
        try:
            return self.curr.connect(**self.conf)
        except self.curr.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("Database does not exist")
            else:
                print(err)

    #########################################
    def create_table(self):
        self.curr.execute(""" DROP TABLE IF EXISTS indigo""")
        self.curr.execute(""" Create table indigo(
            Product_Name text,
            Product_Author text,
            Product_Price text,
            Product_Image text
            )""")

    def process_item(self, item, spider):
        self.store_db(item)

    def store_db(self, item):
        self.curr.execute("""Insert Into indigo values (%s,%s,%s,%s)""",
                          (item['Product_Name'][0],
                           item['Product_Author'][0],
                           item['Product_Price'][0],
                           item['Product_Image'][0],
                           )
                          )
        self.conn.commit()
        return item
        self.conn.close()
Here is the code from my spider:
import scrapy
from ..items import IndScrItem


class IndgoSpider(scrapy.Spider):
    name = 'Indgo'
    start_urls = ['https://www.chapters.indigo.ca/en-ca/books/?link-usage=Header%3A%20books&mc=Book&lu=Main']

    def parse(self, response):
        items = IndScrItem()
        Product_Name = response.css('.product-list__product-title-link--grid::text').getall(),
        Product_Author = response.css('.product-list__contributor::text').getall(),
        Product_Price = response.css('.product-list__price--orange::text').getall(),
        Product_Image = response.css('.product-image--lazy::attr(src)').getall()

        items['Product_Name'] = Product_Name
        items['Product_Author'] = Product_Author
        items['Product_Price'] = Product_Price
        items['Product_Image'] = Product_Image

        yield items
This is what I have in the settings file to enable the pipeline:
ITEM_PIPELINES = {
    'IndgoScrp.pipelines.IndgoscrpPipeline': 100,
}

I actually found that the issue was tied to having previously pip-installed the wrong package, mysql-connector, even though I had installed the correct one through my IDE (PyCharm), so Python was confused. After uninstalling both and reinstalling mysql-connector-python it was able to run.
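A quick way to confirm which package is actually being imported (a diagnostic sketch of my own, not part of the original answer) is to inspect the module from the same interpreter that runs the Scrapy project:

# Run with the interpreter/virtualenv that Scrapy uses (e.g. the PyCharm venv).
# If this import fails, or __file__ points at an unexpected site-packages,
# the old 'mysql-connector' package is likely shadowing 'mysql-connector-python'.
import mysql.connector

print(mysql.connector.__file__)     # where the package actually lives
print(mysql.connector.__version__)  # should report 8.0.19 in this setup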

Related

How to publish to Pub/Sub from Dataflow in batch (efficiently)?

I want to publish messages to a Pub/Sub topic with some attributes using a Dataflow job in batch mode.
My Dataflow pipeline is written with Python 3.8 and apache-beam 2.27.0.
It works with @Ankur's solution here: https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub client: https://stackoverflow.com/a/55833997/9455637
However, an error occurred:
return StockUnpickler.find_class(self, module, name)
AttributeError: Can't get attribute 'PublishFn' on <module 'dataflow_worker.start' from '/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve the Beam pipeline's performance?
Is there another way to avoid the pickling error on my shared publisher client?
My Dataflow Pipeline :
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud.pubsub_v1 import PublisherClient
import json
import argparse
import re
import logging


class PubsubClient(PublisherClient):
    def __reduce__(self):
        return self.__class__, (self.batch_settings,)


# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
    def __init__(self):
        from google.cloud import pubsub_v1

        batch_settings = pubsub_v1.types.BatchSettings(
            max_bytes=1024,  # One kilobyte
            max_latency=1,  # One second
        )
        self.publisher = PubsubClient(batch_settings)
        super().__init__()

    def process(self, element, **kwargs):
        future = self.publisher.publish(
            topic=element["topic"],
            data=json.dumps(element["data"]).encode("utf-8"),
            **element["attributes"],
        )
        return future.result()


def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source_table_id",
        dest="source_table_id",
        default="",
        help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    bq_source_table = known_args.source_table_id
    bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[\.|\:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"

    regex_match = re.search(bq_table_regex, bq_source_table)

    if not regex_match:
        raise ValueError(
            f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
        )

    table_ref = bigquery.TableReference(
        projectId=regex_match.group("PROJECT_ID"),
        datasetId=regex_match.group("DATASET_ID"),
        tableId=regex_match.group("TABLE_ID"),
    )

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable"  #
            >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)  # Each row contains : topic / attributes / data
            | "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)

    run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging

import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage
from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
    BatchSettings,
    LimitExceededBehavior,
    PublishFlowControl,
    PublisherOptions,
)


class PublishClient(PublisherClient):
    """
    You have to override __reduce__ to make PublisherClient pickleable 😡 😤 🤬
    Props to 'Ankur' and 'Benjamin' on SO for figuring this part out; god knows
    I would not have...
    """

    def __reduce__(self):
        return self.__class__, (self.batch_settings, self.publisher_options)


class PubsubWriter(beam.DoFn):
    """
    beam.io.gcp.pubsub does not yet support batch operations, so
    we do this the hard way. it's not as performant as the native
    pubsubio but it does the job.
    """

    def __init__(self, topic: str):
        self.topic = topic
        self.window = beam.window.GlobalWindow()
        self.count = 0

    def setup(self):
        batch_settings = BatchSettings(
            max_bytes=1e6,  # 1MB
            # by default it is 10 ms, should be less than timeout used in future.result() to avoid timeout
            max_latency=1,
        )
        publisher_options = PublisherOptions(
            enable_message_ordering=False,
            # better to be slow than to drop messages during a recovery...
            flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
        )
        self.publisher = PublishClient(batch_settings, publisher_options)

    def start_bundle(self):
        self.futures = []

    def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
        self.window = window
        self.futures.append(
            self.publisher.publish(
                topic=self.topic,
                data=element.data,
                **element.attributes,
            )
        )

    def finish_bundle(self):
        """Iterate over the list of async publish results and block
        until all of them have either succeeded or timed out. Yield
        a WindowedValue of the success/fail counts."""
        results = []
        self.count = self.count + len(self.futures)
        for fut in self.futures:
            try:
                # future.result() blocks until success or timeout;
                # we pass a 60s timeout here, and the small max_latency in
                # BatchSettings keeps batches short, so we should never
                # spend much time waiting.
                results.append(fut.result(timeout=60))
            except Exception as ex:
                results.append(ex)
        res_count = {"success": 0}
        for res in results:
            if isinstance(res, str):
                res_count["success"] += 1
            else:
                # if it's not a string, it's an exception
                msg = str(res)
                if msg not in res_count:
                    res_count[msg] = 1
                else:
                    res_count[msg] += 1
        logging.info(f"Pubsub publish results: {res_count}")
        yield beam.utils.windowed_value.WindowedValue(
            value=res_count,
            timestamp=0,
            windows=[self.window],
        )

    def teardown(self):
        logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you will block until that single message is successfully published, so instead collect a list of futures and then at the end of the bundle make sure they're all either published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.
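For context, here is a minimal sketch of how PubsubWriter could be wired into the pipeline from the question (my addition, not part of the original answer; it reuses pipeline_options, table_ref, bigquery and json from the question's run() function, and the topic path is a hypothetical placeholder):

# Hedged sketch: map BigQuery rows (assumed data / attributes columns) to
# PubsubMessage objects and hand them to PubsubWriter. Note that, unlike the
# question's PublishFn, PubsubWriter publishes everything to one fixed topic.
with beam.Pipeline(options=pipeline_options) as p:
    (
        p
        | "ReadFromBqTable" >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)
        | "ToPubsubMessage" >> beam.Map(
            lambda row: PubsubMessage(
                data=json.dumps(row["data"]).encode("utf-8"),
                attributes=row["attributes"],
            )
        )
        | "PublishRowsToPubSub" >> beam.ParDo(
            PubsubWriter(topic="projects/my-project/topics/my-topic")  # hypothetical topic path
        )
    )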

How to use the value obtained in POST method outside the function

Below is my directory structure:
/home
|___ /sub_directory
|__abc.py
|__ xyz.py
Below is my xyz.py code:
from flask import Flask, request, redirect, url_for, send_from_directory, jsonify, render_template
import mysql.connector
from mysql.connector import Error

app = Flask(__name__)

try:
    connection = mysql.connector.connect(host='127.0.0.1', database='test', user='root', password='')
    if connection.is_connected():
        db_Info = connection.get_server_info()
        cursor = connection.cursor()
        cursor.execute("select id,name from skill_category;")
        record = cursor.fetchall()
        out = [item for t in record for item in t]
except Error as e:
    print("Error while connecting to MySQL", e)

@app.route('/', methods=['GET'])
def dropdown():
    val = record
    return render_template('Rankcv.html', val=val)

@app.route('/get-subskills', methods=['POST'])
def get_subskills():
    skills = request.form['select_skills']
    cursor.execute("SELECT skill_items.name FROM skill_items WHERE skill_items.category_id = " + skills + " ;")
    record = cursor.fetchall()
    out = [item for t in record for item in t]
    ...........
    ...........
    ...........
    return jsonify(something)

if __name__ == "__main__":
    app.run(debug=True)
Now I have to use the values of the variables out and skills in abc.py.
I tried importing xyz directly and retrieving the values using the function name (get_subskills), but it didn't work. Can someone please explain how to solve this?
Import the abc function into xyz.
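As a minimal sketch of that suggestion (my own illustration; handle_subskills, its signature, and the import path are hypothetical and should be adapted to the real layout), abc.py exposes a function and xyz.py passes skills and out into it, rather than abc.py trying to import them back from the Flask module:

# sub_directory/abc.py (hypothetical helper)
def handle_subskills(skills, out):
    # receives the values computed inside the Flask view
    return {"category_id": skills, "subskills": out}


# xyz.py -- only the relevant part of get_subskills() shown
from abc import handle_subskills  # adjust the import to your actual package layout

@app.route('/get-subskills', methods=['POST'])
def get_subskills():
    skills = request.form['select_skills']
    cursor.execute("SELECT skill_items.name FROM skill_items WHERE skill_items.category_id = %s", (skills,))
    record = cursor.fetchall()
    out = [item for t in record for item in t]
    return jsonify(handle_subskills(skills, out))

The parameterized query is just a safer stand-in for the string concatenation in the original; the key point is that the values flow into abc.py through function arguments.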

How to customize ItemExporter to not override output file on Scrapy

I'm writing a simple web crawler with Scrapy that captures data from two different websites. You can find all my files here.
Basically I have a main.py file:
#!/usr/bin/env python
import scrapy
from app.spiders.spider_maquinas import VultrSpider, DigitalOceanSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Main():
    def __init__(self):
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl(VultrSpider())
        process.crawl(DigitalOceanSpider())
        process.start()


if __name__ == '__main__':
    main = Main()
And these pipelines for my two spiders:
import json
from scrapy.exporters import JsonItemExporter, CsvItemExporter


class CustomJsonExporter(JsonItemExporter):
    def _beautify_newline(self):
        self.file.write(b'\n')


class PrintItem:
    def process_item(self, item, spider):
        print(dict(item))
        return item


class JsonPipeline:
    def open_spider(self, spider):
        self.file = open('static/maquinas.json', 'wb')
        self.exporter = CustomJsonExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class CsvPipeline:
    def open_spider(self, spider):
        self.file = open('static/maquinas.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = ['storage', 'cpu', 'memory', 'bandwidth', 'price']

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
But the second spider overrides the first spider's output. Do I need to write new pipelines, changing the file names manually so the data is not overridden, or is there an easier way to do this? Thank you!
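One common way to keep the two spiders from writing over each other (a sketch of my own, not from the original post) is to derive the output file name from spider.name in open_spider, so each spider gets its own file:

class JsonPipeline:
    def open_spider(self, spider):
        # one output file per spider, e.g. static/maquinas_vultr.json
        self.file = open(f'static/maquinas_{spider.name}.json', 'wb')
        self.exporter = CustomJsonExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

The same pattern applies to CsvPipeline.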

How to access the local path of a downloaded image Scrapy

Here is how I am downloading images. Now I have created one more pipeline to insert my scraped data.
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CmindexPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_url']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        print("From Images Items", item)
        return item


class MysqlPipline(object):
    def process_item(self, item, spider):
        print("From Process Items", item['image_path'])
Here is my settings.py:
ITEM_PIPELINES = {'cmindex.pipelines.CmindexPipeline': 1, 'cmindex.pipelines.MysqlPipline': 2}
IMAGES_STORE = 'E:\WorkPlace\python\cmindex\cmindex\img'
IMAGES_THUMBS = {
    '16X16': (16, 16)
}
But unfortunately I am still not able to access item['image_paths'] in process_item. It raises the error:
KeyError: 'image_paths'
If anyone knows what I am doing wrong, please let me know.
The process_item method is called before item_completed, so it does not have the image_paths yet.
If you want to access the image_paths, you will have to do it inside item_completed, or write another pipeline that is placed after the image pipeline.
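For what it's worth, here is a small sketch (my own, not from the answer) of a downstream pipeline reading the field. Note that item_completed() stores the plural key image_paths, while the question's MysqlPipline reads the singular image_path, which is a separate source of a KeyError:

class MysqlPipline(object):
    def process_item(self, item, spider):
        # 'image_paths' (plural) is the key set by CmindexPipeline.item_completed();
        # .get() keeps items without the field from crashing the pipeline
        image_paths = item.get('image_paths', [])
        print("From Process Items", image_paths)
        return item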

pylint error message on cloud 9 CS50

I'm following the CS50 course and am having a problem with application.py. I was getting the following warnings in the Cloud 9 code editor (Ace):
instance of SQLAlchemy has no column member
instance of SQLAlchemy has no integer member
instance of SQLAlchemy has no text member
instance of scoped_session has no add member
instance of scoped_session has no commit member
Class Registrants has no query member
I created a file .pylintrc in the home directory and added the following two lines:
ignored-modules=flask_sqlalchemy
ignored-classes=SQLObject,Registrant
This got rid of most of the errors but I'm left with:
instance of scoped_session has no add member
instance of scoped_session has no commit member
Here's the code that's causing the problem:
from flask import Flask, render_template, redirect, request, url_for
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

# Flask-SQLAlchemy
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///froshims3.db"
app.config["SQLALCHEMY_ECHO"] = True
db = SQLAlchemy(app)


class Registrant(db.Model):
    __tablename__ = "registrants"
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Text)
    dorm = db.Column(db.Text)

    def __init__(self, name, dorm):
        self.name = name
        self.dorm = dorm


@app.route("/")
def index():
    return render_template("index.html")


@app.route("/register", methods=["POST"])
def register():
    if request.form["name"] == "" or request.form["dorm"] == "":
        return render_template("failure.html")
    registrant = Registrant(request.form["name"], request.form["dorm"])
    db.session.add(registrant)
    db.session.commit()
    return render_template("success.html")


@app.route("/registrants")
def registrants():
    rows = Registrant.query.all()
    return render_template("registrants.html", registrants=rows)


@app.route("/unregister", methods=["GET", "POST"])
def unregister():
    if request.method == "GET":
        rows = Registrant.query.all()
        return render_template("unregister.html", registrants=rows)
    elif request.method == "POST":
        if request.form["id"]:
            Registrant.query.filter(Registrant.id == request.form["id"]).delete()
            db.session.commit()
        return redirect(url_for("registrants"))
I needed to add the following in the .pylintrc file:
ignored-classes=SQLObject,Registrant,scoped_session
Apparently Python is creating some classes at run time and pylint isn't able to pick that information up.
I'm not really happy with this answer as it ignores the problem rather than fixing it. If anyone has a better solution please let me know. The staff at CS50 are looking into this, but there is no other solution yet.
Using VS Code, I had to add the following item to the Python › Linting: Pylint Args setting:
--ignored-classes=SQLObject,Registrant,scoped_session
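If you would rather not ignore these classes project-wide, an alternative (my own suggestion, not from the answers above) is pylint's inline disable comment, which suppresses the no-member check only on the lines that trigger it:

# Narrow suppression: only these statements skip pylint's no-member check,
# instead of ignoring scoped_session everywhere via .pylintrc.
db.session.add(registrant)  # pylint: disable=no-member
db.session.commit()  # pylint: disable=no-member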
