Why is data insertion into my Cassandra database so slow? - cassandra

This is the query I use to check whether the current data ID is already present in the Cassandra database:
row = session.execute("SELECT * FROM articles where id = %s", [id])
I consume messages from Kafka and then check whether each message already exists in the Cassandra database. If it does not exist, it should be inserted; if it does exist, it should not be inserted.
messages = consumer.get_messages(count=25)
if len(messages) == 0:
    print 'IDLE'
    sleep(1)
    continue
for message in messages:
    try:
        message = json.loads(message.message.value)
        data = message['data']
        if data:
            for article in data:
                source = article['source']
                id = article['id']
                title = article['title']
                thumbnail = article['thumbnail']
                #url = article['url']
                text = article['text']
                print article['created_at'], type(article['created_at'])
                created_at = parse(article['created_at'])
                last_crawled = article['last_crawled']
                channel = article['channel']  # user id
                category = article['category']
                #scheduled_for = created_at.replace(minute=created_at.minute + 5, second=0, microsecond=0)
                scheduled_for = (datetime.utcnow() + timedelta(minutes=5)).replace(second=0, microsecond=0)
                row = session.execute("SELECT * FROM articles where id = %s", [id])
                if len(list(row)) == 0:
                    # id parse base62
                    ids = [id[0:2], id[2:9], id[9:16]]
                    idstr = ''
                    for argv in ids:
                        num = int(argv)
                        idstr = idstr + encode(num)
                    url = 'http://weibo.com/%s/%s?type=comment' % (channel, idstr)
                    session.execute("INSERT INTO articles(source, id, title, thumbnail, url, text, created_at, last_crawled, channel, category) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (source, id, title, thumbnail, url, text, created_at, scheduled_for, channel, category))
                    session.execute("INSERT INTO schedules(source, type, scheduled_for, id) VALUES (%s, %s, %s, %s) USING TTL 86400", (source, 'article', scheduled_for, id))
                    log.info('%s %s %s %s %s %s %s %s %s %s' % (source, id, title, thumbnail, url, text, created_at, scheduled_for, channel, category))
    except Exception, e:
        log.exception(e)
        #log.info('error %s %s' % (message['url'], body))
        print e
        continue
Edit:
Each ID should map to exactly one row in the table, which is what I want. As soon as I add different scheduled_for times for the same ID, my system breaks. Adding the if len(list(row)) == 0: check was the right idea, but my system becomes very slow after that.
This is my table description:
DROP TABLE IF EXISTS schedules;
CREATE TABLE schedules (
    source text,
    type text,
    scheduled_for timestamp,
    id text,
    PRIMARY KEY (source, type, scheduled_for, id)
);
The scheduled_for value changes over time. Here is a concrete example:
Hao article 2016-01-12 02:09:00+0800 3930462206848285
Hao article 2016-01-12 03:09:00+0801 3930462206848285
Hao article 2016-01-12 04:09:00+0802 3930462206848285
Hao article 2016-01-12 05:09:00+0803 3930462206848285
Thanks for your replies!

Why don't you use INSERT ... IF NOT EXISTS?
https://docs.datastax.com/en/cql/3.1/cql/cql_reference/insert_r.html
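For reference, a rough sketch of what that could look like with the question's articles/schedules schema (this is an assumption about the fix, not code from the thread). Note that IF NOT EXISTS is a lightweight transaction, so it has its own latency cost: it removes the extra SELECT round trip, but it is not free.
# Sketch: conditional insert instead of SELECT-then-INSERT; same column values as in the question's loop.
result = session.execute(
    "INSERT INTO articles (source, id, title, thumbnail, url, text, created_at, last_crawled, channel, category) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) IF NOT EXISTS",
    (source, id, title, thumbnail, url, text, created_at, scheduled_for, channel, category))
# Recent DataStax Python driver versions expose was_applied on the result of a conditional statement.
if result.was_applied:
    session.execute(
        "INSERT INTO schedules (source, type, scheduled_for, id) VALUES (%s, %s, %s, %s) USING TTL 86400",
        (source, 'article', scheduled_for, id))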

Related

Same code with a different table name gives an error in Cassandra

Working with Apache Cassandra in a Jupyter Notebook, creating a table and inserting data both work fine. After I change the table name, the same code gives an error.
The working first code:
session.execute("""CREATE TABLE IF NOT EXISTS table2
(artist text , song text, firstname text , lastname text, userId int ,sessionid int, iteminsession int ,
PRIMARY KEY ((userId, sessionId), itemInSession)) """)
file = 'event_datafile_new.csv'
with open(file, encoding = 'utf8') as f:
csvreader = csv.reader(f)
next(csvreader) # skip header
for line in csvreader:
query = "INSERT INTO table2 (artist, song, firstname, lastname , userId, sessionId, itemInSession)"
query = query + "VALUES (%s, %s, %s, %s ,%s ,%s ,%s)"
# building the insert
# executing the insertion
session.execute(query , (line[0], line[9], line[1], line[4], int(line[10]), int(line[8]), int(line[3])) )
query = "select artist, song, firstname, lastname from table2 WHERE userId= 10 and sessionId = 182"
try:
rows = session.execute(query)
except Exception as e:
print(e)
for row in rows:
print(f'artist: {row.artist}, song: {row.song}, user first name: {row.firstname},user last name: {row.lastname}')
After changing the table name from 'table2' to 'artist_song', I get this error: InvalidRequest: Error from server: code=2200 [Invalid query] message="Undefined column name firstname"
session.execute("""CREATE TABLE IF NOT EXISTS artist_song
(artist text , song text, firstname text , lastname text, userId int ,sessionid int, iteminsession int ,
PRIMARY KEY ((userId, sessionId), itemInSession)) """)
file = 'event_datafile_new.csv'
with open(file, encoding = 'utf8') as f:
csvreader = csv.reader(f)
next(csvreader) # skip header
for line in csvreader:
query = "INSERT INTO artist_song (artist, song, firstname, lastname , userId, sessionId, itemInSession)"
query = query + "VALUES (%s, %s, %s, %s ,%s ,%s ,%s)"
# building the insert
# executing the insertion
session.execute(query , (line[0], line[9], line[1], line[4], int(line[10]), int(line[8]), int(line[3])) )
query = "select artist, song, firstname, lastname from artist_song WHERE userId= 10 and sessionId = 182"
try:
rows = session.execute(query)
except Exception as e:
print(e)
for row in rows:
print(f'artist: {row.artist}, song: {row.song}, user first name: {row.firstname},user last name: {row.lastname}')
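No answer is recorded for this question here, but a common cause of this exact symptom is that artist_song already exists in the keyspace with a different column set: CREATE TABLE IF NOT EXISTS is a no-op when the table exists, so the old definition is kept silently. A minimal diagnostic sketch under that assumption (the keyspace name below is a placeholder; system_schema is available on Cassandra 3.x and later):
# Hypothetical check: see which columns Cassandra actually has for artist_song.
rows = session.execute(
    "SELECT column_name FROM system_schema.columns "
    "WHERE keyspace_name = 'my_keyspace' AND table_name = 'artist_song'")
print([r.column_name for r in rows])

# If the columns do not match the intended schema, drop and recreate the table
# (this deletes any data already stored in it).
session.execute("DROP TABLE IF EXISTS artist_song")
session.execute("""CREATE TABLE artist_song
    (artist text, song text, firstname text, lastname text, userId int, sessionId int, itemInSession int,
    PRIMARY KEY ((userId, sessionId), itemInSession))""")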

Solution found: I'm getting this error when executing my code --> TypeError: not enough arguments for format string

Not sure how to fix this problem. When I step through with the debugger, the execute call fails.
# this function updates the record chosen by the user
def update_record(vector_to_update):
    logging.debug("Executing: update_record")
    try:
        with mydb.cursor() as cursor:
            # prepared statement to update a record in the database
            update_query = ('UPDATE myTable SET ref_date = %s, geo = %s, sex = %s, age_group = %s, '
                            'student_response = %s, uom = %s, uom_id = %s, scalar_factor = %s, scalar_id = %s, '
                            'vector = %s, coordinate = %s, value_field = %s, decimals = %s WHERE vector = "%s"')
            # calls on the user_input function in the dataModel file and stores input in "data"
            data = dataModel.user_input()
            # execute the query using the vector_to_update in the query
            cursor.execute(update_query, (data, vector_to_update))
            # commit changes to database
            mydb.commit()
            print('Updating data for vector: {}'.format(vector_to_update))
            cursor.close()
    except pymysql.DatabaseError as error:
        # if no connection to database
        print("Update record failed to execute {}".format(error))
        # tells the user the input is invalid and goes back thru the delete_record function
        menu_update_record()
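The solution itself isn't shown above, but that traceback usually means the number of %s placeholders doesn't match the number of parameters: the query has fourteen placeholders while execute() receives a two-element tuple whose first item is whatever dataModel.user_input() returns. A hedged sketch of the likely fix, assuming user_input() returns the thirteen SET values in order (and note the quotes around the WHERE placeholder are dropped so the driver can bind it):
# Sketch only: flatten the user input into the parameter tuple; 14 placeholders, 14 values.
update_query = ('UPDATE myTable SET ref_date = %s, geo = %s, sex = %s, age_group = %s, '
                'student_response = %s, uom = %s, uom_id = %s, scalar_factor = %s, scalar_id = %s, '
                'vector = %s, coordinate = %s, value_field = %s, decimals = %s WHERE vector = %s')
data = dataModel.user_input()  # assumed to return the 13 column values in order
cursor.execute(update_query, tuple(data) + (vector_to_update,))
mydb.commit()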

Convert psycopg2 into asyncpg format. "syntax error at or near "%""

I'm converting a Postgres script from psycopg2 to asyncpg.
I'm getting "asyncpg.exceptions.PostgresSyntaxError: syntax error at or near "%"".
I assume my placeholder format is incorrect, but I can't find an example of the correct format.
Original working psycopg2 code:
async def commit_trade_postgres(response_data_input):
    conn = await psycopg2.connect(
        "dbname='postgres' user='postgres' password = 'postgres123' host='localhost' port= '5432'")
    cur = conn.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS trade_{symbol} (time timestamptz NOT NULL, side text, size float, price float, tick_direction text)".format(**response_data_input))
    conn.commit()
    cur.execute(
        "SELECT create_hypertable('trade_{symbol}', 'time', if_not_exists => TRUE)".format(**response_data_input))
    conn.commit()
    cur.execute("INSERT INTO trade_{symbol} (time, side, size, price, tick_direction) VALUES (now(), %(side)s, %(size)s, %(price)s, %(tick_direction)s)".format(
        **response_data_input), (response_data_input))
    conn.commit()
    print("commited trade")
My attempt, as per the example code supplied in the docs:
async def commit_trade_postgres(response_data_input):
    conn = await asyncpg.connect(database='postgres', user='postgres', password='postgres123', host='localhost', port='5432')
    await conn.execute(
        "CREATE TABLE IF NOT EXISTS trade_{symbol} (time timestamptz NOT NULL, side text, size float, price float, tick_direction text)".format(**response_data_input))
    await conn.execute(
        "SELECT create_hypertable('trade_{symbol}', 'time', if_not_exists => TRUE)".format(**response_data_input))
    await conn.execute("INSERT INTO trade_{symbol} (time, side, size, price, tick_direction) VALUES (now(), %(side)s, %(size)s, %(price)s, %(tick_direction)s)".format(
        **response_data_input), (response_data_input))
    print("commited trade")
EDIT: Sample payload, from which I'm extracting 'data' as a dict.
response_dict_instrument = {'topic': 'instrument.BTCUSD', 'data': [{'symbol': 'BTCUSD', 'mark_price': 12367.29, 'index_price': 12360.1}]}
You're formatting the query yourself. You should never do that. I would also suggest creating the table for every incoming symbol beforehand rather than doing it dynamically.
asyncpg uses a $ sign followed by a number to substitute values into the query for you. doc
So, if the input is a dictionary, the syntax should look like this:
async def save_input(input):
    # create connection
    conn = ...
    trade_symbol = input['symbol']
    query = "create table if not exists trade_{trade_symbol} ... ".format(trade_symbol=trade_symbol)  # your column names go here
    await conn.execute(query)
    query = "SELECT create_hypertable('trade_{trade_symbol} ...".format(trade_symbol=trade_symbol)
    await conn.execute(query)
    # I'm not copying your exact keys, you should do it yourself
    values = (input['key1'], input['key2'], input['key3'])
    query = "insert into trade_{trade_symbol} (key1, key2, key3) values ($1, $2, $3);".format(trade_symbol=trade_symbol)
    await conn.execute(query, *values)
    await conn.close()
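Applied to the question's own table and columns (symbol, side, size, price, tick_direction from the original INSERT), the pattern might look roughly like the following; it assumes the per-trade dict carries those same keys and reuses the connection parameters from the question:
import asyncpg

# Sketch only: asyncpg $n placeholders with the column list from the question's INSERT.
async def commit_trade_postgres(trade):
    conn = await asyncpg.connect(database='postgres', user='postgres',
                                 password='postgres123', host='localhost', port=5432)
    try:
        query = ("INSERT INTO trade_{symbol} (time, side, size, price, tick_direction) "
                 "VALUES (now(), $1, $2, $3, $4)").format(symbol=trade['symbol'])
        await conn.execute(query, trade['side'], trade['size'],
                           trade['price'], trade['tick_direction'])
    finally:
        await conn.close()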

psycopg2 doesn't save data into the Postgres database

I am web scraping certain articles from a news site, and I am using psycopg2 to connect to a Postgres database and save data from each article.
with conn.cursor() as cur:
    query = """INSERT INTO
        articles (title, article_body, author, author_title, source_date, "createdAt", "updatedAt")
        VALUES (%s, %s, %s, %s, %s, %s, %s);"""
    cur.execute(query, (articleTitle, parsedText, articleAuthor, articleAuthorTitle, articlePostDate, now, now))
    cur.execute('SELECT author FROM articles')
    rows = cur.fetchall()
    print('')
    print(rows)
    print('')
The thing is that when the second query is executed, it returns the data from the articles table, but when I query through psql in the terminal, the articles table is empty.
The insert is never committed, so it is only visible inside your own connection's transaction. Try this; hope it helps.
with conn.cursor() as cur:
    query = """INSERT INTO
        articles (title, article_body, author, author_title, source_date, "createdAt", "updatedAt")
        VALUES (%s, %s, %s, %s, %s, %s, %s);"""
    cur.execute(query, (articleTitle, parsedText, articleAuthor, articleAuthorTitle, articlePostDate, now, now))
    conn.commit()
    cur.execute('SELECT author FROM articles')
    rows = cur.fetchall()
    print('')
    print(rows)
    print('')
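As an aside (not part of the original answer), psycopg2 connections can also be used as context managers: the transaction is committed automatically when the with block exits without an exception and rolled back otherwise. A minimal sketch:
# Sketch: the connection context manager commits on clean exit (it does not close the connection).
with conn:
    with conn.cursor() as cur:
        cur.execute(query, (articleTitle, parsedText, articleAuthor,
                            articleAuthorTitle, articlePostDate, now, now))
# The row is committed here and will also be visible from psql.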

How to avoid SQL injection when inserting data from a CSV file with variables in Python 3 and PyMySQL?

My environment:
MySQL (MariaDB) version 5.5.56
Python 3.6
Situation:
I have a telephone statistics CSV file that is generated every day, and I need to insert its data into my MySQL DB.
Type: Extension Statistic Report,,,,,,,,
From 2018/4/17 上午 12:00:00 To 2018/4/18 上午 12:00:00
Agent Extension: Any number
,,,,,,,,
Agent Extension,,Inbound,,Outbound,,Total,,Total Talking time
,, Answered,Unanswered,Answered,Unanswered,Answered,Unanswered,
100 MeetingRoom,,0,0,0,0,0,0,00:00:00
101 Build,,0,0,0,0,0,0,00:00:00
102 Lead,,0,0,2.00,1.00,2.00,1.00,01:36:09
103 Discover,,0,0,0,0,0,0,00:00:00
105 Fatto,,1.00,0,28.00,9.00,29.00,9.00,01:07:27
106 Meditare,,0,0,0,0,0,0,00:00:00
Total:,,122.00,41.00,152.00,49.00,274.00,90.00,10h 43m 17s
This is my Code:
import csv, sys, os
import pymysql
from datetime import datetime, timedelta

# DB Config
dbconn = pymysql.connect(host='192.168.X.X',
                         port=3306,
                         user='root',
                         passwd='********',
                         db='test',
                         charset='utf8')
cursor = dbconn.cursor()

# Get today's date.
def get_date(d):
    toDay = timedelta(days=d)
    yesDay = datetime.now() + toDay
    return yesDay.strftime("%Y%m%d")

# Get today's str value.
yesterday = get_date(-1)
beforeyesterday = get_date(-2)

with open("/Users/fiona/Downloads/statistics_1704_v1nNHbvGjnIQ2mVwsMLr.csv") as file:
    readCSV = csv.reader(file)
    extensionCodes = []       # Store extension Number
    usersName = []            # Store User Name
    inboundsAnswered = []     # Store Inbound Answered
    inboundsUnanswered = []   # Store Inbound Unanswered
    outboundsAnswered = []    # Store Outbound Answered
    outboundsUnanswered = []  # Store Outbound Unanswered
    totalsAnswered = []       # Store Total Answered
    totalsUnanswered = []     # Store Total Unanswered
    totalsTalkingTime = []    # Store Total Talking time
    for index, rows in enumerate(readCSV):
        if index not in range(0, 7) and rows[0] != "":
            if str(rows[0])[:3] != "Tot":
                extensionCode = str(rows[0])[:3]  # Store every row's extension number
            elif str(rows[0])[:5] == "Total":
                break
            userName = rows[0]  # Store every row's name
            inboundAnswered = float(rows[2])
            inboundUnanswered = float(rows[3])
            outboundAnswered = float(rows[4])
            outboundUnanswered = float(rows[5])
            totalAnswered = float(rows[6])
            totalUnanswered = float(rows[7])
            totalTalkingTime = rows[8]
            sql = """
                INSERT INTO
                    test (extension_number, username, inbound_answered, inbound_unanswered,
                          outbound_answered, outbound_unanswered, total_answered, total_unanswered,
                          total_talking_time, createtime)
                VALUES
                    (%d, %s, %d, %d, %d, %d, %d, %d, %s, %s);
                """ % (int(extensionCode), "'"+userName+"'", int(inboundAnswered), int(inboundUnanswered),
                       int(outboundAnswered), int(outboundUnanswered), int(totalAnswered),
                       int(totalUnanswered), "'"+totalTalkingTime+"'", yesterday)
            print(sql)  # Testing SQL Syntax
            cursor.execute(sql)

dbconn.commit()
cursor.close()
dbconn.close()
Using the code above I can insert my data into the DB, but I also want to avoid the SQL injection problem. So I did some research and changed my code, but I still could not get it to work.
Python best practice and securest to connect to MySQL and execute queries
How can I escape the input to a MySQL db in Python3?
How to use variables in SQL statement in Python?
Python MySQL Parameterized Queries
Now I know that if I want to avoid SQL injection, I cannot use % to interpolate my variable values; I have to pass the values as a separate argument. But I found that when I do that, the values seem to be treated as strings, which makes my %d placeholders fail.
My DB design is like this:
Picture
Is there anyone who can give me some advice or direction?
Thank you for your help!
Update 1:
If I use reference 4:
sql = """
INSERT INTO test (extension_number, username, inbound_answered, inbound_unanswered, outbound_answered, outbound_unanswered, total_answered, total_unanswered,
total_talking_time, createtime)
VALUES (%d, %s, %d, %d, %d, %d, %d, %d, %s, %s)
""", (int(extensionCode), userName, int(inboundAnswered), int(inboundUnanswered), int(outboundAnswered), int(outboundUnanswered),
int(totalAnswered), int(totalUnanswered), totalTalkingTime, yesterday)
it shows:
packet = prelude + sql[:packet_size-1]
TypeError: can't concat tuple to bytes
('\n INSERT INTO test (extension_number, username, inbound_answered, inbound_unanswered, \n outbound_answered, outbound_unanswered, total_answered, total_unanswered, \n total_talking_time, createtime)\n VALUES (%d, %s, %d, %d, %d, %d, %d, %d, %s, %s)\n ', (100, 'MeetingRoom', 0, 0, 0, 0, 0, 0, '00:00:00', '20180423'))
Process finished with exit code 1
Update 2:
I tried another way,
sql = "INSERT INTO test (extension_number, username, inbound_answered, inbound_unanswered, " \
"outbound_answered, outbound_unanswered, total_answered, total_unanswered, total_talking_time, " \
"createtime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
(int(extensionCode), userName, int(inboundAnswered), int(inboundUnanswered),
int(outboundAnswered), int(outboundUnanswered), int(totalAnswered),
int(totalUnanswered), totalTalkingTime, yesterday)
cursor.execute(sql)
but it is still not working:
packet = prelude + sql[:packet_size-1]
TypeError: can't concat tuple to bytes
Update 3:
Finally, I found the way:
sql = "INSERT INTO test (extension_number, username, inbound_answered, " \
"inbound_unanswered, outbound_answered, outbound_unanswered, " \
"total_answered, total_unanswered, total_talking_time, createtime) " \
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
data = (extensionCode, userName, inboundAnswered, inboundUnanswered,
outboundAnswered, outboundUnanswered, totalAnswered,
totalUnanswered, totalTalkingTime, yesterday)
cursor.execute(sql, data)
So it seems that if I want to use variables with cursor.execute(), I have to pass the SQL statement and the values separately. If I want to put the SQL and the values on one line, I have to pass both directly to cursor.execute(); double quotes or triple quotes are both fine.
such as:
cursor.execute("""INSERT INTO test (extension_number, username, inbound_answered, inbound_unanswered,
outbound_answered, outbound_unanswered, total_answered, total_unanswered, total_talking_time,
createtime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
(extensionCode, userName, inboundAnswered, inboundUnanswered, outboundAnswered, outboundUnanswered, totalAnswered, totalUnanswered, totalTalkingTime, yesterday))
Is separating the SQL statement and the values more secure than passing them both directly to cursor.execute(), or are they equivalent?
Thank you for your advice in pointing me in the right direction!
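For what it's worth (not part of the original thread), both forms are equally safe as long as the values are passed to execute() as a separate parameter sequence instead of being interpolated with %; PyMySQL escapes them either way. Since the script inserts one row per CSV line, here is a hedged sketch of batching them with cursor.executemany, reusing the statement and column order from Update 3:
# Sketch only: collect per-row tuples while reading the CSV, then insert them in one call.
rows_to_insert = []
for index, rows in enumerate(readCSV):
    if index not in range(0, 7) and rows[0] != "" and str(rows[0])[:5] != "Total":
        rows_to_insert.append((str(rows[0])[:3], rows[0], float(rows[2]), float(rows[3]),
                               float(rows[4]), float(rows[5]), float(rows[6]), float(rows[7]),
                               rows[8], yesterday))

sql = ("INSERT INTO test (extension_number, username, inbound_answered, inbound_unanswered, "
       "outbound_answered, outbound_unanswered, total_answered, total_unanswered, "
       "total_talking_time, createtime) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
cursor.executemany(sql, rows_to_insert)  # still parameterized, so still injection-safe
dbconn.commit()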
