I am trying to append to my Vertica (SQL) table through pandas, using SQLAlchemy:
import pandas as pd
import sqlalchemy as sa
Create an engine for Vertica:
def get_engine(base):
    engine = sa.create_engine(
        "{sys}+{dri}://{user}:{password}@{host}:{port}/{database}".format(**login[base]))
    return engine
engine = get_engine('vertica')
Just for clarity, a simple query:
table = '***'
sql =\
'''
select *
from public.{table}
'''.format(table=table)
connection = engine.connect()
data = pd.read_sql(sql, connection)
connection.close()
Data is not empty:
print(len(data))
569955
And try to write to the same table:
fields = list(data.columns)
connection = engine.connect()
data.to_sql(table, connection, schema='public', index=False, if_exists='append', chunksize=30000,
dtype={fields[0]:sa.types.Integer,
fields[1]:sa.types.VARCHAR,
fields[2]:sa.types.Integer,
fields[3]:sa.types.Integer,
fields[4]:sa.types.Integer,
fields[5]:sa.types.VARCHAR,
fields[6]:sa.types.VARCHAR,
fields[7]:sa.types.VARCHAR,
fields[8]:sa.types.VARCHAR,
fields[9]:sa.types.VARCHAR,
fields[10]:sa.types.VARCHAR,
fields[11]:sa.types.VARCHAR,
fields[12]:sa.types.DateTime
})
connection.close()
And get this error:
...
\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_executemany(self, cursor, statement, parameters, context)
465
466 def do_executemany(self, cursor, statement, parameters, context=None):
--> 467 cursor.executemany(statement, parameters)
468
469 def do_execute(self, cursor, statement, parameters, context=None):
\Anaconda3\lib\site-packages\vertica_python\vertica\cursor.py in executemany(self, operation, seq_of_parameters)
153 else:
154 raise NotImplementedError(
--> 155 "executemany is implemented for simple INSERT statements only")
156
157 def fetchone(self):
NotImplementedError: executemany is implemented for simple INSERT statements only
I got the same error when I was trying to write my data to Vertica using SQLAlchemy. In my case the issue was the column names: it seems it cannot write column names that include special characters. I fixed the error by removing all '_', '%' and whitespace characters from the column names in pandas, and then used df.to_sql() to write the data to Vertica.
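A minimal sketch of that workaround (assuming the data, table and connection objects from the question; the exact characters to strip may differ in your case):
import re

# Remove '_', '%' and whitespace characters from every column name,
# then retry the append with pandas.
data.columns = [re.sub(r"[_%\s]", "", col) for col in data.columns]
data.to_sql(table, connection, schema='public', index=False,
            if_exists='append', chunksize=30000)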
NOTE: Need to use distributed processing, which is why I am utilizing Pandas API on Spark.
To create the pandas-on-Spark DataFrame, I attempted 2 different methods (outlined below: "OPTION 1", "OPTION 2").
Is either of these options feasible? If so, how do I proceed, given the errors (outlined below in "ISSUE(S)" and in the error log for "OPTION 2")?
Alternatively, should I start with PySpark SQL Pandas UDFs for the query, and then convert to pandas-on-Spark DataFrame?
# (Spark 3.2.0, Scala 2.12, DBR 10.0)
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## I. Import libraries & dependencies
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql import Column
from pyspark.sql.functions import *
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## II. Load data + create Spark DataFrames
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
df_1 = spark.read.format("snowflake").options(**options).option("query","SELECT PROPERTY_ID,AVGRENT_MARKET FROM schema_1").load()
df_2 = spark.read.format("snowflake").options(**options).option("query","SELECT PROPERTY_ID,PROPERTY_ZIPCODE FROM schema_2").load()
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## III. OPTION 1: Union Spark DataFrames
## ISSUE(S): Results in 'None' values in PROPERTY_ZIPCODE column
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## Create merged dataframe from two Spark Dataframes
# df_3 = df_1.unionByName(df_2, allowMissingColumns=True)
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## III. OPTION 2: Create Spark SQL DataFrame from SQL tables
## ISSUE(S): "AnalysisException: Reference 'PROPERTY_ID' is ambiguous, could be: table_1.PROPERTY_ID, table_2.PROPERTY_ID."
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## Create tables from two Spark DataFrames
df_1.createOrReplaceTempView("Table_1")
df_2.createOrReplaceTempView("Table_2")
## Specify SQL Snowflake query to merge tables
merge_tables = '''
SELECT Table_1.PROPERTY_ID,
Table_1.AVGRENT_MARKET,
Table_2.PROPERTY_ID,
Table_2.PROPERTY_ZIPCODE
FROM Table_2 INNER JOIN Table_1
ON Table_2.PROPERTY_ID=Table_1.PROPERTY_ID
LIMIT 25
'''
## Create merged Spark SQL dataframe based on query
df_3 = spark.sql(merge_tables)
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
## Create a pandas-on-Spark DataFrame
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
df_3 = ps.DataFrame(df_3)
# df_3 = df_3.to_pandas_on_spark() # Alternative conversion option
Error log for "OPTION 2":
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
<command-2142959205032388> in <module>
52 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
53 # df_3 = ps.DataFrame(df_3)
---> 54 df_3 = df_3.to_pandas_on_spark() # Alternative conversion option
/databricks/spark/python/pyspark/sql/dataframe.py in to_pandas_on_spark(self, index_col)
2777
2778 index_spark_columns, index_names = _get_index_map(self, index_col)
-> 2779 internal = InternalFrame(
2780 spark_frame=self, index_spark_columns=index_spark_columns, index_names=index_names
2781 )
/databricks/spark/python/pyspark/pandas/internal.py in __init__(self, spark_frame, index_spark_columns, index_names, index_fields, column_labels, data_spark_columns, data_fields, column_label_names)
633
634 # Create default index.
--> 635 spark_frame = InternalFrame.attach_default_index(spark_frame)
636 index_spark_columns = [scol_for(spark_frame, SPARK_DEFAULT_INDEX_NAME)]
637
/databricks/spark/python/pyspark/pandas/internal.py in attach_default_index(sdf, default_index_type)
865
866 if default_index_type == "sequence":
--> 867 return InternalFrame.attach_sequence_column(sdf, column_name=index_column)
868 elif default_index_type == "distributed-sequence":
869 return InternalFrame.attach_distributed_sequence_column(sdf, column_name=index_column)
/databricks/spark/python/pyspark/pandas/internal.py in attach_sequence_column(sdf, column_name)
878 @staticmethod
879 def attach_sequence_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
--> 880 scols = [scol_for(sdf, column) for column in sdf.columns]
881 sequential_index = (
882 F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1
/databricks/spark/python/pyspark/pandas/internal.py in <listcomp>(.0)
878 @staticmethod
879 def attach_sequence_column(sdf: SparkDataFrame, column_name: str) -> SparkDataFrame:
--> 880 scols = [scol_for(sdf, column) for column in sdf.columns]
881 sequential_index = (
882 F.row_number().over(Window.orderBy(F.monotonically_increasing_id())).cast("long") - 1
/databricks/spark/python/pyspark/pandas/utils.py in scol_for(sdf, column_name)
590 def scol_for(sdf: SparkDataFrame, column_name: str) -> Column:
591 """Return Spark Column for the given column name."""
--> 592 return sdf["`{}`".format(column_name)]
593
594
/databricks/spark/python/pyspark/sql/dataframe.py in __getitem__(self, item)
1657 """
1658 if isinstance(item, str):
-> 1659 jc = self._jdf.apply(item)
1660 return Column(jc)
1661 elif isinstance(item, Column):
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
121 # Hide where the exception came from that shows a non-Pythonic
122 # JVM exception message.
--> 123 raise converted from None
124 else:
125 raise
AnalysisException: Reference 'PROPERTY_ID' is ambiguous, could be: table_1.PROPERTY_ID, table_2.PROPERTY_ID.
If all you want is a join, then use the Spark join function instead. It's much cleaner and more maintainable.
df_1 = spark.read...load()
df_2 = spark.read...load()
df_3 = df_1.join(df_2, on=['PROPERTY_ID'], how='inner')
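If you still need a pandas-on-Spark DataFrame afterwards, the conversion from the question should then work on the joined result. A sketch, assuming Spark 3.2+ where pyspark.pandas is available:
import pyspark.pandas as ps

# Convert the joined Spark DataFrame to a pandas-on-Spark DataFrame;
# ps.DataFrame(df_3) and df_3.to_pandas_on_spark() are equivalent here.
psdf_3 = ps.DataFrame(df_3)
print(psdf_3.head())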
I have a small Python script that builds a DataFrame with one (or more) NaNs and then writes it to a Postgres database with the psycopg2 module, using the copy_from function. Here it is:
import logging
import traceback
from io import StringIO

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extensions
from psycopg2 import sql

table_name = "test"
df = pd.DataFrame([[1.0, 2.0], [3.0, np.nan]], columns=["VALUE0", "VALUE1"], index=pd.date_range("2000-01-01", "2000-01-02"))
database = "xxxx"
user = "xxxxxxx"
password = "xxxxxx"
host = "127.0.0.1"
port = "xxxxx"
def nan_to_null(f,
                _NULL=psycopg2.extensions.AsIs('NULL'),
                _NaN=np.NaN,
                _Float=psycopg2.extensions.Float):
    if f != f:
        return _NULL
    else:
        return _Float(f)
psycopg2.extensions.register_adapter(float, nan_to_null)
psycopg2.extensions.register_adapter(np.float, nan_to_null)
psycopg2.extensions.register_adapter(np.float64, nan_to_null)
with psycopg2.connect(database=database,
                      user=user,
                      password=password,
                      host=host,
                      port=port) as conn:
    try:
        with conn.cursor() as cur:
            cmd = "CREATE TABLE {} (TIMESTAMP timestamp PRIMARY KEY NOT NULL, VALUE0 FLOAT, VALUE1 FLOAT)"
            cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
            buffer = StringIO()
            df.to_csv(buffer, index_label='TIMESTAMP', header=False)
            buffer.seek(0)
            cur.copy_from(buffer, table_name, sep=",")
            conn.commit()
    except Exception as e:
        conn.rollback()
        logging.error(traceback.format_exc())
        raise e
The problem is that psycopg2 fails to transform NaN into a Postgres NULL, although I have used this trick:
How do I convert numpy NaN objects to SQL nulls?
(the nan_to_null function).
I cannot make it work; it throws the following exception:
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type double precision: ""
CONTEXT: COPY test, line 2, column value1: ""
I am using Python 3.8 on Windows 10 with Anaconda 3, psycopg2 v2.8.5 and Postgres v12.3.
Thanks!
Here is the same code, updated with the solution from Adrian Klaver.
The line that changed is:
df.to_csv(buffer, index_label='TIMESTAMP', header=False, na_rep='NaN')
We added na_rep='NaN' to the to_csv call. There is no need to replace the NaNs with another line of code; replacing them with 'NULL' does not work.
import psycopg2, logging, numpy as np, pandas as pd
from psycopg2 import sql
import traceback
from io import StringIO
if __name__ == '__main__':
    table_name = "test"
    df = pd.DataFrame([[1.0, 2.0], [3.0, np.nan]], columns=["VALUE0", "VALUE1"], index=pd.date_range("2000-01-01", "2000-01-02"))
    database = "xxxxxx"
    user = "xxxxx"
    password = "xxxxxx"
    host = "127.0.0.1"
    port = "xxxxxx"
    with psycopg2.connect(database=database,
                          user=user,
                          password=password,
                          host=host,
                          port=port) as conn:
        try:
            with conn.cursor() as cur:
                # Creating a new table test
                cmd = "CREATE TABLE {} (TIMESTAMP timestamp PRIMARY KEY NOT NULL, VALUE0 FLOAT, VALUE1 FLOAT);"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                # Writing content
                buffer = StringIO()
                df.to_csv(buffer, index_label='TIMESTAMP', header=False, na_rep='NaN')
                buffer.seek(0)
                cur.copy_from(buffer, table_name, sep=",")
                # Reading the table content
                cmd = "SELECT * FROM {};"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                test_data = pd.DataFrame(cur.fetchall())
                print(test_data)
                print(type(test_data.loc[1, 2]))
                # Deleting the test table
                cmd = "DROP TABLE {};"
                cur.execute(sql.SQL(cmd).format(sql.Identifier(table_name)))
                conn.commit()
        except Exception as e:
            conn.rollback()
            logging.error(traceback.format_exc())
            raise e
The printed output shows that the NaN is correctly interpreted and stored in the DB.
The issue is the use of copy_from. From the docs:
Currently no adaptation is provided between Python and PostgreSQL types on COPY: ...
So your adapter does not come into play.
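To illustrate the point, a hedged sketch (reusing the cursor and table from your code): the registered adapter is honored by execute()/executemany(), just not by COPY:
from datetime import datetime

# Parameters passed to execute() go through psycopg2's adaptation layer,
# so nan_to_null turns the NaN into a SQL NULL here.
cur.execute(
    "INSERT INTO test (timestamp, value0, value1) VALUES (%s, %s, %s)",
    (datetime(2000, 1, 3), 4.0, float("nan")),
)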
UPDATE: A possible solution:
Pandas Changing the format of NaN values when saving to CSV
See @cs95's answer.
It seems you are inserting an empty string instead of a NULL value; you can easily reproduce your error with the following SQL code:
CREATE TABLE test(
x FLOAT
);
INSERT INTO test(x) VALUES ('');
-- ERROR: invalid input syntax for type double precision: "" Position: 29
On the other hand, NaN can be safely inserted into PostgreSQL:
INSERT INTO test(x) VALUES ('NaN');
Note that PostgreSQL's float support differs slightly from the IEEE 754 standard, because PostgreSQL needs all values to be orderable in order to build indexes consistently. Therefore, in PostgreSQL, NaN is greater than or equal to any other number, including itself.
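A quick way to see this from Python (my own illustration, reusing the cursor from the question):
# In PostgreSQL's ordering, NaN compares equal to itself and greater
# than any non-NaN value, including infinity.
cur.execute("SELECT 'NaN'::float8 > 'Infinity'::float8, 'NaN'::float8 = 'NaN'::float8")
print(cur.fetchone())  # (True, True)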
Thanks to Adrian Klaver's and jlandercy's answers, the solution is simple: replace np.nan with 'NaN' manually, using the following line in place of the nan_to_null function:
df.replace(np.nan, "NaN", inplace=True)
And it works fine. Thank you guys!
Add na_rep='NaN' when you write your csv file.
If you are using this in conjunction with psycopg2's copy_expert method, you may also need to add the null "NaN" option to your Postgres COPY statement so that the null representations match up.
Here's an example:
df.to_csv(csv_filename, index=False, na_rep='NaN')
string = sql.SQL("""
copy {}
from stdin (
format csv,
null "NaN",
delimiter ',',
header
)
""").format(sql.Identifier(table_name))
I am having trouble doing language detection.
The code below raises an exception.
from langdetect import detect
for row in df['Comments']:
    text = str(row)
    language_code = detect(text)
    sentence = [all_languages_codes.get(language_code)]
    df['Language'] = sentence[0]
Error Message:
148 ngrams = self._extract_ngrams()
149 if not ngrams:
--> 150 raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
151
152 self.langprob = [0.0] * len(self.langlist)
LangDetectException: No features in text.
How do I print out the row that causes the LangDetectException?
It looks like one of your Comments strings is empty:
detect("")
LangDetectException: No features in text.
To know for sure, you can wrap the call in a try/except block and launch a debugger or interactive shell when an exception is raised:
from langdetect import detect
for row in df['Comments']:
    try:
        text = str(row)
        language_code = detect(text)
        sentence = [all_languages_codes.get(language_code)]
        df['Language'] = sentence[0]
    except Exception:
        import ipdb; ipdb.set_trace()
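Alternatively, a sketch that simply records the offending rows instead of dropping into a debugger (the bad_rows list is my own addition; LangDetectException comes from langdetect's lang_detect_exception module):
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

bad_rows = []
for i, row in df['Comments'].items():
    try:
        detect(str(row))
    except LangDetectException:
        # Keep the index and raw value so the problematic rows can be inspected.
        bad_rows.append((i, row))

print(bad_rows)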
Edit 12/07/19: The problem was not in fact with the pd.rename function, but with the fact that I did not return the pandas DataFrame from the function, and as a result the column change did not exist when printing, i.e.
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- This was missing
Please see the user comment below and upvote them for finding this error for me.
Alternatively, you can continue reading.
The data can be downloaded from this link, but I have also added a sample dataset. The file is not formatted as a typical CSV file; I believe it may have been an assessment piece related to the Hidden Decision Tree article. I have included the portion of the code that resolves the formatting issues of the text file mentioned above and allows the user to rename the columns.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seems to work when I set the column names inside the rename function:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the KeyError:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'},
                     inplace=True)
def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader
# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts a variable number of arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows
def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')
# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the v file contents the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # change_column_names(multi_sets)
    change_column_names(multi_sets, 'Old_Name', 'New_Name')
    print(multi_sets)
main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math
def set_new_columns(as_pandas):
    titles_list = ['Year > 2014', 'Forum', 'Blog', 'Python', 'R',
                   'Machine_Learning', 'Data_Science', 'Data',
                   'Analytics']
    for number, word in enumerate(titles_list):
        as_pandas.insert(len(as_pandas.columns), titles_list[number], 0)
def title_length(as_pandas):
    # Insert new column header then count the number of letters in 'Title'
    as_pandas.insert(len(as_pandas.columns), 'Title_Length', 0)
    as_pandas['Title_Length'] = as_pandas['Title'].map(str).apply(len)
# Although it is a log, the percentage of change is an inverse linear
# comparison of logX1 - logX2, therefore you could think of it as the
# percentage change in Page Views. The map function allows a function
# to be applied to all rows in the column 'Page_Views'.
def log_page_view(as_pandas):
    # Insert new column header
    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(lambda x: math.log(1 + float(x)))
def change_to_numeric(as_pandas):
    # Check for missing values then convert the column to numeric.
    as_pandas = as_pandas.replace(r'^\s*$', np.nan, regex=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas
def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader
# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts a variable number of arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows
def open_file(file_name):
    import csv
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')
# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the v file contents the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    multi_sets = change_column_names(multi_sets)
    change_to_numeric(multi_sets)
    log_page_view(multi_sets)
    title_length(multi_sets)
    set_new_columns(multi_sets)
    print(multi_sets)
main()
I'm experiencing an issue with py2neo and the Spark driver: I cannot insert nodes inside a foreach or map loop, like in the code below, for example.
from py2neo import authenticate, Graph, cypher, Node
from pyspark import broadcast
infos=df.rdd
authenticate("localhost:7474", "neo4j", "admin")
graph = Graph(password='admin')
tx = graph.begin()
def node(row):
    query = Node("item", event_id=row[0], text=row[19])
    tx.create(query)
infos.foreach(node)
tx.commit()
Here is the end of the stack trace:
/usr/local/apache/spark-2.2.1-bin-hadoop2.6/python/pyspark/rdd.py in _wrap_function(sc, func, deserializer, serializer, profiler)
2386 assert serializer, "serializer should not be empty"
2387 command = (func, profiler, deserializer, serializer)
-> 2388 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2389 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
2390 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/usr/local/apache/spark-2.2.1-bin-hadoop2.6/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
2372 # the serialized command will be compressed by broadcast
2373 ser = CloudPickleSerializer()
-> 2374 pickled_command = ser.dumps(command)
2375 if len(pickled_command) > (1 << 20): # 1M
2376 # The broadcast will have same life cycle as created PythonRDD
/usr/local/apache/spark-2.2.1-bin-hadoop2.6/python/pyspark/serializers.py in dumps(self, obj)
462
463 def dumps(self, obj):
--> 464 return cloudpickle.dumps(obj, 2)
465
466
/usr/local/apache/spark-2.2.1-bin-hadoop2.6/python/pyspark/cloudpickle.py in dumps(obj, protocol)
702
703 cp = CloudPickler(file,protocol
I think I can't pass the tx parameter inside the loop.
We tried to work around this issue by instantiating a connection directly inside the loop, like in the code below. It works for a small matrix, but when I try it with a 20-million-row one, it stops at some point.
from py2neo import authenticate, Graph, cypher, Node
infos=df.rdd
authenticate("localhost:7474", "neo4j", "password")
def node(row):
    graph = Graph(password='admin')
    tx = graph.begin()
    query = Node("item", event_id=row[0], text=row[19])
    tx.create(query)
    tx.commit()
infos.foreach(node)
I did some research on the neo4j-spark connector; it seems you can add the library, but there is no example provided and I'm not at all sure that such functionality is actually available in Python. What would be the best way to solve this problem?
A standard pattern for resolving this type of issue is to use foreachPartition:
def nodes(rows):
    graph = Graph(password='admin')
    tx = graph.begin()
    for row in rows:
        query = Node("item", event_id=row[0], text=row[19])
        tx.create(query)
    tx.commit()
infos.foreachPartition(nodes)
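For very large RDDs (such as the 20-million-row case), it may also help to commit in batches inside each partition instead of holding one huge transaction open. A sketch under that assumption, with an arbitrary batch size:
def nodes_batched(rows, batch_size=10000):
    graph = Graph(password='admin')
    tx = graph.begin()
    count = 0
    for row in rows:
        tx.create(Node("item", event_id=row[0], text=row[19]))
        count += 1
        if count % batch_size == 0:
            # Flush the current transaction and start a fresh one.
            tx.commit()
            tx = graph.begin()
    tx.commit()

infos.foreachPartition(nodes_batched)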