AWS --extra-py-files throwing ModuleNotFoundError: No module named 'pg8000' - python-3.x

I am trying to use pg8000 in my Glue script. The following parameter is set on the Glue job:
--extra-py-files s3://mybucket/pg8000libs.zip //NOTE: my zip contains __init__.py
Some insights into the code:
import sys
import os
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
from pyspark.sql import Row
from datetime import datetime, date
zip_path = os.path.join('/tmp', 'pg8000libs.zip')
sys.path.insert(0, zip_path)
def dump_python_path():
    print("python path:", sys.path)
    for path in sys.path:
        if os.path.isdir(path):
            print(f"dir: {path}")
            print("\t" + str(os.listdir(path)))
        print(path)
    print(os.listdir('/tmp'))

dump_python_path()
# Import the library
import pg8000
Dump in CloudWatch:
python path: ['/tmp/pg8000libs.zip', '/opt/amazon/bin', '/tmp/pg8000libs.zip', '/opt/amazon/spark/jars/spark-core_2.12-3.1.1-amzn-0.jar', '/opt/amazon/spark/python/lib/pyspark.zip', '/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip', '/opt/amazon/lib/python3.6/site-packages', '/usr/lib64/python37.zip', '/usr/lib64/python3.7', '/usr/lib64/python3.7/lib-dynload', '/home/spark/.local/lib/python3.7/site-packages', '/usr/lib64/python3.7/site-packages', '/usr/lib/python3.7/site-packages']
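One thing worth checking is whether the pg8000 package directory sits at the top level of the zip rather than nested under a subfolder; if the archive only contains a bare __init__.py, the import will still fail. A minimal sketch for inspecting the archive layout, assuming the zip really is staged at /tmp/pg8000libs.zip as the dump above suggests:
import os
import sys
import zipfile
zip_path = os.path.join('/tmp', 'pg8000libs.zip')
# List the entries inside the archive; pg8000 is only importable from the
# zip if names like 'pg8000/__init__.py' appear at the top level.
with zipfile.ZipFile(zip_path) as zf:
    for name in zf.namelist():
        print(name)
# Only add the zip to the import path after confirming the layout.
sys.path.insert(0, zip_path)
import pg8000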

Related

How to write pyspark dataframe directly into S3 bucket?

I want to save a PySpark dataframe directly into an S3 bucket. I tried some options but I keep getting an error. Can someone help me solve the problem?
I created a sample PySpark dataframe and tried to save it to the S3 bucket directly.
I tried the code below:
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql.functions import last
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from functools import reduce
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import max
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window
from pyspark.sql.functions import abs, lit
#from __future__ import division
import sys
import mysql.connector
import traceback
import json
#from sqlalchemy import create_engine
import os
import math
import os.path
import datetime
from os import getpid
import pymysql.cursors
import time
import signal
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.context import SparkConf
from collections import OrderedDict
import multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from threading import Thread
from functools import partial
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.application import MIMEApplication
from email import encoders
import smtplib
import shutil
import glob
from datetime import datetime, date
from pyspark.sql import Row
spark = SparkSession.builder.appName("app_name").getOrCreate()
print(spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
sc = spark.sparkContext
aws_access_key_id="*******"
aws_secret_access_key="********"
spark._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", aws_access_key_id)
spark._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", aws_secret_access_key)
spark._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
spark._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.DefaultAWSCredentialsProviderChain')
df = spark.createDataFrame([Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0)),
                            Row(a=2, b=8., c='GFG2', d=date(2000, 6, 2), e=datetime(2000, 6, 2, 12, 0)),
                            Row(a=4, b=5., c='GFG3', d=date(2000, 5, 3), e=datetime(2000, 5, 3, 12, 0))])
print(df.show())
print(df.printSchema())
df.write.format('csv').option('header','true').save('s3a://******/testing_s3/emp.csv',mode='overwrite')
After running this code I get the error below:
py4j.protocol.Py4JJavaError: An error occurred while calling o48.save.
: com.amazonaws.services.s3.model.AmazonS3Exception: Status Code: 403, AWS Service: Amazon S3, AWS Request ID: RNKTVM6JMDACZ16W, AWS Error Code: null, AWS Error Message: Forbidden, S3 Extended Request ID: MS8lToBlzqSmn1YDdq6SPh7JC6aCKSROuldEz5x9LbsnQdxhKVEQriOpJz5KkCJPBnlk4KgsCkQ=
Please tell me what I am missing in my script. Thanks in advance!
After creating the Spark context, use these lines to set the credentials:
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", AWS_ACCESS_KEY_ID)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
or
import pyspark
conf = (
    pyspark.SparkConf()
    .setAppName('app_name')
    .setMaster(SPARK_MASTER)
    .set('spark.hadoop.fs.s3a.access.key', AWS_ACCESS_KEY)
    .set('spark.hadoop.fs.s3a.secret.key', AWS_SECRET_KEY)
)
sc = pyspark.SparkContext(conf=conf)
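Putting the two pieces together, a minimal end-to-end sketch of the write path; the bucket name and key values are placeholders, and it assumes the s3a:// scheme is used consistently instead of mixing fs.s3 settings with an s3a path as in the question:
from datetime import date, datetime
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.appName("app_name").getOrCreate()
# Configure the s3a filesystem; the credential values here are placeholders.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", "MY_ACCESS_KEY")
hadoop_conf.set("fs.s3a.secret.key", "MY_SECRET_KEY")
df = spark.createDataFrame([Row(a=1, b=4., c='GFG1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0))])
# Write to a directory path: Spark produces part files under it rather than a single emp.csv.
df.write.mode('overwrite').option('header', 'true').csv('s3a://my-bucket/testing_s3/emp/')
Whichever of the two configuration styles is used, the 403 Forbidden also requires that the supplied credentials actually have permission to write to that bucket.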

Unable to add/import additional python library datacompy in aws glue

I am trying to import an additional Python library, datacompy, into a Glue job (Glue version 2) with the steps below:
Open the AWS Glue console.
Under Job parameters, add the following:
For Key, add --additional-python-modules.
For Value, add datacompy==0.7.3, s3://python-modules/datacompy-0.7.3.whl.
import sys  # needed for sys.argv below
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import datacompy
from py4j.java_gateway import java_import
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
## #params: [JOB_NAME, URL, ACCOUNT, WAREHOUSE, DB, SCHEMA, USERNAME, PASSWORD]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'URL', 'ACCOUNT', 'WAREHOUSE', 'DB', 'SCHEMA','additional-python-modules'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
but the job returns the error
ModuleNotFoundError: No module named 'datacompy'
How do I resolve this issue?
With Spark 2.4, Python 3 (Glue Version 2.0), I set the --additional-python-modules job parameter as described above. Then I can import it in my job like so:
import pandas as pd
import numpy as np
import datacompy
df1 = pd.DataFrame(np.random.randn(10,2), columns=['a','b'])
df2 = pd.DataFrame(np.random.randn(10,2), columns=['a','b'])
compare = datacompy.Compare(df1, df2, join_columns='a')
print(compare.report())
and when I check the CloudWatch log for the job run, the report output is there.
If you're using a Python Shell job, try the following:
Create a datacompy whl file, or download it from PyPI.
Upload that file to an S3 bucket (a sketch of the upload step follows below).
Then enter the path to the S3 whl file in the Python library path box:
s3://my-bucket/datacompy-0.8.0-py3-none-any.whl
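The upload step can be scripted as well as done in the console; a small sketch using boto3, where the bucket name and local wheel path are placeholders:
import boto3
# Placeholders: substitute your own bucket and the path to the downloaded wheel.
bucket = "my-bucket"
wheel_file = "datacompy-0.8.0-py3-none-any.whl"
# Upload the wheel so the Python Shell job can reference it via the Python library path box.
s3 = boto3.client("s3")
s3.upload_file(wheel_file, bucket, wheel_file)
print("s3://{}/{}".format(bucket, wheel_file))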

How can I solve this problem in VS Code, IPython (TensorFlow)?

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import check_util.checker as checker
from IPython.display import clear_output
from PIL import Image
import os
import time
import re
from glob import glob
import shutil
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import keras
print('tensorflow version: {}'.format(tf.__version__))
print('GPU available: {}'.format(tf.test.is_gpu_available()))
When I run this, there is no error, but the cell shows as running indefinitely (the timer keeps counting: 630..631..632...sec).
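The imports themselves finish; the hang is most likely in the last print line, because tf.test.is_gpu_available() initializes the GPU runtime and can block indefinitely if the CUDA driver setup is broken. A lighter-weight check, sketched here under the assumption of TensorFlow 2.x, only lists the visible devices:
import tensorflow as tf
print('tensorflow version: {}'.format(tf.__version__))
# list_physical_devices only enumerates devices; it does not run the
# GPU self-test that tf.test.is_gpu_available() performs.
gpus = tf.config.list_physical_devices('GPU')
print('GPU available: {}'.format(len(gpus) > 0))
for gpu in gpus:
    print(gpu)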

ImportError: No module named pyspark_llap

Below is my main code, which I want to unit test:
get_data.py
from pyspark.sql import SparkSession
from pyspark_llap.sql.session import HiveWarehouseSession
def get_hive_data(query):
    hive_data = hive.executeQuery(query)
    return hive_data

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("HiveApp")\
        .getOrCreate()
    hive = HiveWarehouseSession.session(spark).build()
    data = get_hive_data()
Below is my unittest code. I have written only the imports here, since I get the error as soon as I do from get_data import *
test.py
import unittest
import pyspark
import pyspark.sql.functions as f
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from get_data import *
ERROR
ImportError: No module named pyspark_llap
But if I run just get_data.py, it runs successfully. I am running it on an edge node!
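One common way to unit test code whose dependencies only exist on the cluster is to stub the missing module before importing the code under test. A sketch using only the standard library; the stubbed module names follow the imports in get_data.py, and the test itself is illustrative:
import sys
import unittest
from unittest import mock
# Stub pyspark_llap before "from get_data import *" runs, so the import in
# get_data.py resolves even on a machine without the Hive Warehouse Connector.
sys.modules['pyspark_llap'] = mock.MagicMock()
sys.modules['pyspark_llap.sql'] = mock.MagicMock()
sys.modules['pyspark_llap.sql.session'] = mock.MagicMock()
from get_data import get_hive_data
class TestGetHiveData(unittest.TestCase):
    def test_returns_query_result(self):
        fake_hive = mock.MagicMock()
        fake_hive.executeQuery.return_value = 'rows'
        # get_hive_data reads the module-level name "hive", so patch it there.
        with mock.patch('get_data.hive', fake_hive, create=True):
            self.assertEqual(get_hive_data('select 1'), 'rows')
if __name__ == "__main__":
    unittest.main()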

Problems with PyInstaller: the exe doesn't show all the things that I created

I've been working on an app to manage a little store with Python and SQLite. For the GUI I used tkinter. I've written 13 Python scripts to make the app work. One of them, called "principal.py", imports most of the other scripts, and those scripts import other modules like tkcalendar or xlwt (to write Excel sheets), etc.
This is an example of the first lines of code from "principal.py":
#=========== IMPORT PACKAGES =============
import sqlite3
from sqlite3 import Error
import os
import sys
import tkinter as tk
from tkinter import ttk
from tkinter import *
from tkinter import scrolledtext
from tkinter import Menu
from tkinter import messagebox as msg
import tkcalendar
from tkcalendar import *
import conexion as cnx
import proveedores
from proveedores import Proveedores
from proveedores import *
import productos
from productos import Categoria_Productos
from productos import *
import clientes
from clientes import Clientes
from clientes import *
import colaboradores
from colaboradores import Colaboradores
from colaboradores import *
import herramientas
from herramientas import Roles
from herramientas import Usuarios
from herramientas import *
import recepciondoc
from recepciondoc import Recepcion
from recepciondoc import *
import ventas
from ventas import Ventas
from ventas import *
import ingresos
from ingresos import *
import movimientos
from movimientos import *
import excel_ingresos
from excel_ingresos import *
import excel_venta
from excel_venta import *
The app works as expected when I run the script from VS Code, but when I try to make an exe from my little project using PyInstaller, it looks incomplete when I run the exe file from the "dist" folder.
This is how I built it:
Create the spec file with pyi-makespec principal.py
Re-write principal.spec to include the database path (see the sketch after these steps)
Run pyinstaller --windowed principal.spec
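Those symptoms usually mean the build is missing data files (the SQLite database) or modules that PyInstaller's analysis did not detect. A sketch of the part of principal.spec that typically needs editing; the database file name store.db and the hiddenimports list are placeholders, not taken from the original project:
# principal.spec (excerpt) -- generated by pyi-makespec, then edited.
a = Analysis(
    ['principal.py'],
    pathex=['.'],
    # Bundle the SQLite database next to the executable ('store.db' is a placeholder name).
    datas=[('store.db', '.')],
    # Modules loaded indirectly are not always detected automatically.
    hiddenimports=['tkcalendar', 'babel.numbers', 'xlwt'],
)
The rest of the generated spec (PYZ, EXE, COLLECT) can stay as pyi-makespec produced it; rebuilding from the edited spec then bundles those files.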
