I am having difficulty using the output of one class's method inside another class. Below is a sample of the code:
import pandas as pd
# First Class
class ppl():
    """Sample class from the question: builds two small demo tables."""

    def create_ppl(self):
        """Construct the people and instructor frames.

        Both frames are bound to local names only and the method has no
        ``return`` statement, so every call evaluates to ``None``.
        """
        people_columns = {
            "Name": ["Bob", "Amy"],
            "Age": [12, 19],
            "Gender": ["male", "female"],
        }
        instructor_columns = {
            "Name_Inst": ["Tobby", "John"],
            "Age": [41, 37],
            "Gender": ["male", "male"],
        }
        ppll_info = pd.DataFrame(people_columns)
        instructor_info = pd.DataFrame(instructor_columns)


# Module-level instance; the call's result (None) is discarded.
pl = ppl()
pl.create_ppl()
# second Class
class robot():
    """Sample class from the question: joins the robot table to the people table."""

    def create_rbt(self):
        """Left-join the robot table onto whatever ``pl.create_ppl()`` returns.

        ``pl`` is looked up as a module-level name, not as an attribute of
        this class. The join matches the robots' ``Owner`` column against the
        ``Name`` column of the right-hand table.

        Returns:
            The merged table produced by ``pd.merge``.
        """
        robot_columns = {
            "Name": ["Robot_1", "Robt_2"],
            "Manufacture": ["ABC", "XYZ"],
            "Owner": ["Bob", "Amy"],
        }
        rbt_info = pd.DataFrame(robot_columns)
        # Evaluated after the robot table is built, exactly as in the inline call.
        right_table = pl.create_ppl()
        full_table = pd.merge(
            rbt_info,
            right_table,
            left_on="Owner",
            right_on="Name",
            how="left",
        )
        return full_table


rbt = robot()
rbt.create_rbt()
I received the error TypeError: Can only merge Series or DataFrame objects, a <class 'NoneType'> was passed
It works if I add return ppll_info to the end of the method in the first class. However, my actual output dataframe is really large and I don't want to return it. Is there a more elegant way to do this?
Thank you in advance for the help
Related
I have just started learning the spark-specific functions in python. I am writing one program that reads, transforms, and writes the data. I am using the transform function to do the transformation.
I have just started learning the spark-specific functions in python. I am writing one program that reads, transforms, and writes the data. I am using the transform function to do the transformation.
import logging
import sys
from datetime import datetime
from pyspark.sql import SparkSession
class Ingest:
    """Holds the Spark session and the input/output locations for the ingest job."""

    def __init__(self):
        """Create (or reuse) the Spark session and record the I/O directories."""
        self.spark_session = (
            SparkSession.builder.master("cluster")
            .appName("IngestData")
            .getOrCreate()
        )
        self.input_directory = "/mnt/input"
        self.output_directory = "/mnt/output"

    def applyLogic(self, df):
        """Add a ``name`` column flagging titles that mention both keywords.

        The DataFrame is taken as an argument so the bound method can be
        handed to ``DataFrame.transform`` (``df.transform(obj.applyLogic)``),
        which calls it with the DataFrame being transformed.

        Args:
            df: Spark DataFrame with a ``Title`` column.

        Returns:
            ``df`` with an extra ``name`` column holding ``"Intelligent"``
            where ``Title`` contains both "Gemini" and "Cancer"; the ``when``
            expression has no ``otherwise`` branch for the remaining rows.
        """
        # Imported here so the method is self-contained; the file's import
        # block only brings in SparkSession.
        from pyspark.sql.functions import when

        df.cache()
        mentions_both = (
            df["Title"].contains("Gemini")
            & df["Title"].contains("Cancer")
        )
        return df.withColumn("name", when(mentions_both, "Intelligent"))
# NOTE: this is the failing call from the question. Ingest defines no
# `transform` method, and `applyLogic` is a method of Ingest rather than a
# module-level name, so the second line cannot succeed as written.
obj = Ingest()
transformedDF = obj.transform(applyLogic)
I am getting an error while using the transform function. Any pointers on how to call the transform function using a class object, or am I doing something wrong? I am able to run it without using the transform function.
I was able to resolve it after making the below change
# `df` is not defined in this snippet -- presumably an existing Spark
# DataFrame; verify against the surrounding program.
# DataFrame.transform calls the function it is given with the DataFrame
# itself, so the bound method is passed rather than called on `obj`.
obj = Ingest()
transformedDF = df.transform(obj.applyLogic)
Pandas allows you to extend its DataFrame class by using the pd.api.extensions.register_dataframe_accessor() decorator.
While this is functional, it doesn't offer any additional type hinting capabilities.
For example, I would expect the following to type check OK and even provide type hints
import pandas as pd
# The decorator must be live (not commented out): it is what attaches this
# class to every DataFrame under the attribute name ``dataset``.
@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    """Custom accessor reachable as ``df.dataset`` on any DataFrame."""

    def __init__(self, df: pd.DataFrame):
        # pandas hands the owning DataFrame to the accessor when it is built.
        self._df = df

    def foo(self, bar) -> str:
        """Return the constant string ``"foobar"``; ``bar`` is not used."""
        return "foobar"


foo = pd.DataFrame({"foo": ["bar"]})
foo.dataset.foo("bar")
^
No Suggestions
How can I get dataframe accessors to provide autocomplete?
This can be done somewhat hackishly using typing.TYPE_CHECKING and a bit of inheritance.
from typing import TYPE_CHECKING
import pandas as pd
# The decorator must be live (not commented out): it is what attaches this
# class to every DataFrame under the attribute name ``dataset``.
@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    """Custom accessor reachable as ``df.dataset`` on any DataFrame."""

    def __init__(self, df: pd.DataFrame):
        # pandas hands the owning DataFrame to the accessor when it is built.
        self._df = df

    def foo(self, bar) -> str:
        """Return the constant string ``"foobar"``; ``bar`` is not used."""
        return "foobar"


if TYPE_CHECKING:
    # Only static analysers evaluate this branch: it declares a DataFrame
    # subtype whose ``dataset`` attribute is typed as the accessor class.
    class DataFrame(pd.DataFrame):
        dataset: Extension


# The quoted 'DataFrame' annotation is what gives tooling the accessor type;
# you have to do this every time you transform the DataFrame.
foo: 'DataFrame' = pd.DataFrame({"foo": ["bar"]})
# With the annotation in place, autocomplete is now provided on foo.dataset.
foo.dataset.foo("bar")
Unfortunately, PyCharm does not check the __annotations__ dictionary, or really do any dynamic type checking, so there doesn't appear to be any more universal solutions.
I have set up a function and a class method in python 3.6 that both fetch the boston data set. In the latter case the boston data set is stored as a class attribute of the object.
The 'Bunch' type is converted to a pandas dataframe in the exact same manner in both instances.
When I inspect both in the pycharm debugger, using the View as Array / Data Frame functionality of pycharm...
https://www.jetbrains.com/help/pycharm/viewing-as-array.html
...I can view the df that results by calling the function and assigning the output to a variable, but not the df that is assigned to the class attribute of the instantiated object.
from sklearn.datasets import load_boston
import pandas as pd
# define function to get boston data
def get_boston():
    """Load the Boston data through ``load_boston`` and return one DataFrame.

    The loader's ``data`` becomes the frame body with ``feature_names`` as
    column labels, and its ``target`` is added as a ``target`` column.
    """
    bunch = load_boston()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame["target"] = bunch.target
    return frame
class MyData:
    """Container that keeps the Boston data on the instance."""

    def __init__(self, raw_data=None, processed_data=None):
        """Store the two optional payloads; both default to ``None``."""
        self.raw_data, self.processed_data = raw_data, processed_data

    def get_data(self):
        """Load the Boston data and keep it in ``self.raw_data``.

        The frame is assembled the same way as in ``get_boston`` (feature
        columns plus a ``target`` column) but is stored on the instance
        instead of being returned, so the call itself evaluates to ``None``.
        """
        bunch = load_boston()
        frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        frame["target"] = bunch.target
        self.raw_data = frame
# Route 1: the frame is stored on the instance as `raw_data`
# (get_data assigns the attribute and returns nothing).
my_data_1 = MyData()
my_data_1.get_data()
# Route 2: the frame is returned by the plain function and bound to a variable.
my_data_2 = get_boston()
# Printing the attribute's first five rows.
print(my_data_1.raw_data.head(5))
When I run the debugger session, I can inspect the my_data_2 variable with the 'View as Data Frame' function in PyCharm, but when I do the same for the class attribute raw_data, the 'View as Data Frame' dialog window shows 'nothing to show', even though I can print its content to the console properly (using the pandas.DataFrame.head() method).
I am using PyCharm 2018.3.3 on Ubuntu Linux.
I'm working with The Guardian Python API, through the TheGuardian.search_content() and TheGuardian.data_to_csv() methods. The first one returns a dictionary, called json_content. The second one iterates over json_content and writes its content to a CSV file. My question is: why does the TheGuardian.data_to_csv() method receive <class 'script_guardian.TheGuardian'> instead of <class 'dict'>?
If I have understood correctly, TheGuardian.data_to_csv() just receives an instance of the TheGuardian class. Does it have to do with the type of method I'm using, e.g. abstract, static or class method?
Steps:
>>> from script_guardian import TheGuardian
>>> tg = TheGuardian('2016-01-01', '2018-01-01')
>>> json_content = tg.search_content('education', 'relevance', 'education')
>>> json_content
<bound method Content.get_content_response of <theguardian.theguardian_content.Content object at 0x7f7bb9764c88>>
>>> type(json_content)
<class 'method'>
How can I get search_content's return instead the method itself?
My full code:
import requests
from theguardian import theguardian_content
import csv
class TheGuardian:
    """Small wrapper that queries Guardian content and dumps the results to CSV."""

    # Column order of the CSV file; each name is also the key read from
    # every entry of the response's result list.
    CSV_FIELDS = (
        "webUrl", "webPublicationDate", "webTitle", "sectionName",
        "apiUrl", "id", "isHosted", "sectionId", "type", "pillarId", "pillarName",
    )

    def __init__(self, from_date='2016-01-01', to_date='2018-01-01'):
        """Remember the date window used by later searches.

        Args:
            from_date: Start of the window, sent as ``from-date``.
            to_date: End of the window, sent as ``to-date``.
        """
        self.from_date = from_date
        self.to_date = to_date

    def search_content(self, content='education', page_size=10, order_by='relevance',
                       api_key='test'):
        """Run a content search and return the client's response.

        The four arguments are also stored on the instance under the same
        names.

        Args:
            content: Search term, sent as ``q``.
            page_size: Sent as ``page-size``.
            order_by: Sent as ``order-by``.
            api_key: Sent as ``api``.

        Returns:
            Whatever ``Content.get_content_response()`` returns -- presumably
            the decoded JSON dict that ``data_to_csv`` expects; confirm
            against the ``theguardian`` package.
        """
        self.content = content
        self.page_size = page_size
        self.order_by = order_by
        self.api_key = api_key
        # Query parameters. The date keys are both hyphenated: the original
        # 'to_date' spelling did not match the 'from-date' style used by the
        # other parameters.
        params = {
            'from-date': self.from_date,
            'to-date': self.to_date,
            'order-by': self.order_by,
            'page-size': self.page_size,
            'q': self.content,
            'api': self.api_key
        }
        client = theguardian_content.Content(**params)
        # The parentheses matter: without them the bound method itself would
        # be returned instead of the response.
        return client.get_content_response()

    def data_to_csv(self, json_content):
        """Write one CSV row per search result to ``guardian_data.csv``.

        Args:
            json_content: Mapping shaped like
                ``{'response': {'results': [ {...}, ... ]}}`` where every
                result carries all keys listed in ``CSV_FIELDS``.

        Raises:
            KeyError: If the mapping lacks ``response``/``results`` or a
                result lacks one of the ``CSV_FIELDS`` keys.
        """
        self.json_content = json_content
        # newline='' hands line-ending control to the csv module, as its
        # documentation requires for files passed to csv.writer.
        with open('guardian_data.csv', 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            writer.writerow(self.CSV_FIELDS)
            for result in json_content['response']['results']:
                # writerow takes ONE iterable; passing the values as separate
                # positional arguments raises TypeError.
                writer.writerow([result[field] for field in self.CSV_FIELDS])
Your data_to_csv is incomplete since you are accessing it as a class method. You should define it as data_to_csv(self,json_content) and pass the result of search content to it, like tg.data_to_csv(tg.search_content())
Take a good look at this doc page, https://docs.python.org/2/tutorial/classes.html
I have created a package to quickly transform data using pandas and xlsxwriter.
This worked pretty well and I wrote a few functions successfully. But recently I've hit a wall:
For a few functions I need to define variables first, but they are not basic types (list, tuple, str etc.); they are, for instance, a dataframe. I've looked into global variables and saw that they are not recommended (and I wouldn't know where to put them), and I also looked into classes, but I don't know how to solve my problem using them. I've also tried creating an empty dataframe, but it was still empty after calling the function.
What I'm trying to do is a read function with pandas for .csv or .xlsx and a function for saving with Xlsxwriter engine.
The goal is to change as little as possible in the code to transform data frequently and rapidly (e.g. i have functions doing LEFT,RIGHT like in Excel or even MIDDLE with column numbers) and have an easy and short code in main.py.
Here is the stripped down version of my code which uses 2 python files (main.py and format_operations.py). I have added commentaries where I'm having issues.
Thanks in advance for your help!
"""
main.py
"""
import format_operations as tbfrm #import another python file in the same folder
import pandas as pd
import numpy as np
import xlsxwriter.utility
#file settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
dfname = ??? #I need to create the variable but I don't know how
tbfrm.FCT_universal_read(dfname,file_full_path) #CAN'T GET IT TO WORK
#column operations and formatting
columns_numeric = [3,6] # (with pandas) list of columns with number values by iloc number, starts at 0 which is column A in Excel
tbfrm.FCT_columns_numeric(dfname,columns_numeric) #example of a WORKING function (if dfname is defined)
#write with Xlsxwriter engine
XLWRITER_DF = ??? #same problem as before, how to create the variable?
workbookvarname = ??? #same here
worksheetvarname = ??? # same here
tbfrm.FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname) #CAN'T GET IT TO WORK
#### WORKING piece of code I want to execute after saving with Xlsxwriter engine ####
worksheet.set_zoom(80)
# Conditional formatting
color_range_1 = "J1:J{}".format(number_rows+1)
FORMAT1 = workbook.add_format({'bg_color': '#FFC7CE','font_color': '#9C0006'})
FORMAT2 = workbook.add_format({'bg_color': '#C6EFCE','font_color': '#006100'})
worksheet.conditional_format(color_range_1, {'type': 'bottom','value': '5','format': FORMAT1})
worksheet.conditional_format(color_range_1, {'type': 'top','value': '5','format': FORMAT2})
Other file:
"""
format_operations.py
"""
import pandas as pd
import numpy as np
import xlsxwriter.utility
def FCT_universal_read(dfname,file_full_path):
    """Read ``file_full_path`` with the pandas reader matching its name.

    The result is bound only to the local parameter name ``dfname``; the
    object the caller passed in is not modified and nothing is returned, so
    the call evaluates to ``None``.

    Args:
        dfname: Placeholder supplied by the caller; only rebound locally.
        file_full_path: Path whose text is searched for ".xls" and ".csv".
    """
    # pd.read_excel optional arguments: sheetname='Sheet1', header=0,
    # dtype=object to preserve values.
    readers = ((".xls", pd.read_excel), (".csv", pd.read_csv))
    # Both markers are tested independently, in this order.
    for marker, reader in readers:
        if marker in file_full_path:
            dfname = reader(file_full_path)
# save file with XLSXWriter engine for additional options to pandas
def FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname):
    """Write ``dfname`` to a sheet through an xlsxwriter-backed ExcelWriter.

    The writer, workbook and worksheet are bound only to local names (the
    ``XLWRITER_DF``, ``workbookvarname`` and ``worksheetvarname`` arguments
    are overwritten, never read), and nothing is returned. The writer is
    neither saved nor closed here.

    Args:
        XLWRITER_DF: Ignored placeholder.
        dfname: DataFrame to write.
        file_save_to: Path given to ``pd.ExcelWriter``.
        sheet_name_save_to: Name of the sheet to write to.
        workbookvarname: Ignored placeholder.
        worksheetvarname: Ignored placeholder.
    """
    writer = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(writer, sheet_name=sheet_name_save_to, encoding='utf-8')
    XLWRITER_DF = writer
    workbookvarname, worksheetvarname = writer.book, writer.sheets[sheet_name_save_to]
#format as numbers
def FCT_columns_numeric(dfname,columns_numeric):
    """Convert the listed columns of ``dfname`` to numbers, in place.

    Args:
        dfname: DataFrame to modify.
        columns_numeric: Positional (``iloc``) column indices to convert
            with ``pd.to_numeric``.
    """
    for position in columns_numeric:
        current = dfname.iloc[:, position]
        dfname.iloc[:, position] = pd.to_numeric(current)
Your FCT_universal_read function should not modify a dataframe but instead return a new one:
def FCT_universal_read(file_full_path):
    """Read an Excel or CSV file into a new DataFrame and return it.

    The reader is chosen from the text after the last dot of the path,
    compared case-insensitively.

    Args:
        file_full_path: Path of the file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel`` or ``pd.read_csv``.

    Raises:
        ValueError: If the extension is neither an Excel one nor ``csv``.
    """
    extension = file_full_path.rsplit('.', 1)[-1].lower()
    # Comparing against "xls" alone would reject ".xlsx" workbooks.
    if extension in ("xls", "xlsx", "xlsm", "xlsb"):
        # optional arguments: sheetname='Sheet1', header=0,
        # dtype=object to preserve values
        return pd.read_excel(file_full_path)
    if extension == "csv":
        return pd.read_csv(file_full_path)
    # Fail with a clear message instead of reaching a return of an
    # unassigned local.
    raise ValueError(
        "Unsupported file extension for {!r}; expected an Excel or .csv file".format(file_full_path)
    )
And in your main, do:
dfname = tbfrm.FCT_universal_read(file_full_path)
Same answer for FCT_df_xlsxwriter, you should rewrite it with a return so that you can do:
XLWRITER_DF, workbookvarname,worksheetvarname = tbfrm.FCT_df_xlsxwriter(dfname,file_save_to,sheet_name_save_to)
To grasp how python is dealing with the arguments you pass to a function, you should read these blog posts:
https://jeffknupp.com/blog/2012/11/13/is-python-callbyvalue-or-callbyreference-neither/
https://robertheaton.com/2014/02/09/pythons-pass-by-object-reference-as-explained-by-philip-k-dick/
You need to update FCT_universal_read so that it returns the dataframe you want. There is no need to define the dataframe outside the function, simply create and return it
df = FCT_universal_read('/your/file/path')
def FCT_universal_read(file_full_path):
    """Create and return a DataFrame read from ``file_full_path``.

    The path text is searched for ".xls" first and ".csv" second; the first
    marker found decides the reader. A path containing neither marker falls
    through and the function returns ``None``.
    """
    # pd.read_excel optional arguments: sheetname='Sheet1', header=0,
    # dtype=object to preserve values.
    readers = ((".xls", pd.read_excel), (".csv", pd.read_csv))
    for marker, reader in readers:
        if marker in file_full_path:
            return reader(file_full_path)
    return None
Thanks so much to both of you !! I get the logic now :)! Thanks also for the documentation.
I successfully managed to write both functions. I had been struggling for several hours.
I like the .split function that you used which ensures the script only looks at the extension.
I updated FCT_xlsxwriter and FCT_universal_read as you were saying. Here are both functions corrected:
'''
format_operations.py
'''
def FCT_universal_read(file_full_path):
    """Read ``file_full_path`` into a DataFrame chosen by its extension.

    Only the text after the last dot is inspected: if it contains "xls" the
    file is read with ``pd.read_excel`` (example: C:/Tests/Bigdata.xlsx),
    otherwise if it contains "csv" it is read with ``pd.read_csv``. Any
    other extension yields ``None``.
    """
    extension = file_full_path.rpartition('.')[2]
    if "xls" in extension:
        return pd.read_excel(file_full_path)
    if "csv" in extension:
        return pd.read_csv(file_full_path)
    return None
def FCT_df_xlsxwriter(dfname,file_save_to,sheet_name_save_to):
    """Write ``dfname`` to a sheet and hand back the xlsxwriter handles.

    Args:
        dfname: DataFrame to write.
        file_save_to: Path given to ``pd.ExcelWriter``.
        sheet_name_save_to: Name of the sheet to create.

    Returns:
        A ``(writer, workbook, worksheet)`` tuple: the ``pd.ExcelWriter``,
        its ``book`` and the sheet object registered under
        ``sheet_name_save_to``. The writer is returned without being saved
        or closed; that is left to the caller.
    """
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    # No ``encoding`` keyword: DataFrame.to_excel dropped that argument in
    # pandas 2.0, where passing it raises TypeError.
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to)
    workbook = XLWRITER_DF.book
    worksheet = XLWRITER_DF.sheets[sheet_name_save_to]
    return XLWRITER_DF,workbook,worksheet
Here is how I call the two functions:
'''
main.py
'''
import format_operations as tbfrm
import pandas as pd
import xlsxwriter.utility
#settings
file_full_path= "C:/Tests/big_data.xlsx"  # source file handed to FCT_universal_read
file_save_to= "C:/Tests/Xlsxwriter.xlsx"  # output path handed to FCT_df_xlsxwriter
sheet_name_save_to= "Xlswriter"  # sheet name handed to FCT_df_xlsxwriter
#functions
# The read function returns the DataFrame, so it is captured here rather than
# passed in as a pre-created variable.
FILE_DF = tbfrm.FCT_universal_read(file_full_path)
# The writer function returns three values, unpacked in the order it returns them.
XLWRITER_DF,workbook,worksheet = tbfrm.FCT_df_xlsxwriter(FILE_DF,file_save_to,sheet_name_save_to)