Pandas DataFrame Accessor Type Hints

Pandas allows you to extend its DataFrame class by using the pd.api.extensions.register_dataframe_accessor() decorator.
While this is functional, it doesn't offer any additional type hinting capabilities.
For example, I would expect the following to type check OK and even provide type hints:
import pandas as pd

@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    def __init__(self, df: pd.DataFrame):
        self._df = df

    def foo(self, bar) -> str:
        return "foobar"

foo = pd.DataFrame({"foo": ["bar"]})
foo.dataset.foo("bar")
# ^ the IDE offers no suggestions here
How can I get dataframe accessors to provide autocomplete?

This can be done somewhat hackishly using typing.TYPE_CHECKING and a bit of inheritance.
from typing import TYPE_CHECKING

import pandas as pd

@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    def __init__(self, df: pd.DataFrame):
        self._df = df

    def foo(self, bar) -> str:
        return "foobar"

if TYPE_CHECKING:
    class DataFrame(pd.DataFrame):
        dataset: Extension

foo: 'DataFrame' = pd.DataFrame({"foo": ["bar"]})
# ^ you have to re-annotate like this every time you transform the DataFrame
foo.dataset.foo("bar")
# ^ autocomplete is now provided
Unfortunately, PyCharm does not inspect the __annotations__ dictionary or do any other dynamic type evaluation, so there doesn't appear to be a more universal solution.
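One way to cut down on the repeated annotations is a small helper built on typing.cast. This is only a sketch of the same TYPE_CHECKING trick (the as_dataset name is an illustrative choice, and it reuses the Extension accessor registered above):
from typing import TYPE_CHECKING, cast

import pandas as pd

if TYPE_CHECKING:
    class DataFrame(pd.DataFrame):
        dataset: Extension  # the accessor class registered above

def as_dataset(df: pd.DataFrame) -> 'DataFrame':
    # A type-level cast only; no runtime conversion or copying happens.
    return cast('DataFrame', df)

foo = as_dataset(pd.DataFrame({"foo": ["bar"]}))
foo.dataset.foo("bar")
transformed = as_dataset(foo.head(1))  # re-wrap after each transform instead of re-annotating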


attrs convert list[str] to list[float]

Given the following scenario:
import attrs

@attrs.define(kw_only=True)
class A:
    values: list[float] = attrs.field(converter=float)

A(values=["1.1", "2.2", "3.3"])
which results in
*** TypeError: float() argument must be a string or a real number, not 'list'
Obviously it's due to providing the whole list to float, but is there a way to get attrs to do the conversion on each element, without providing a custom converter function?
As far as I know, attrs doesn't have a built-in option to switch conversion or validation to "element-wise", the way Pydantic's validators have the each_item parameter.
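(For comparison, a minimal sketch of that Pydantic pattern, assuming Pydantic v1: coercion to list[float] happens element-wise on its own, and each_item runs a validator once per element.)
from pydantic import BaseModel, validator

class A(BaseModel):
    values: list[float]  # v1 coerces "1.1" -> 1.1 per element automatically

    @validator("values", each_item=True)
    def not_nan(cls, v: float) -> float:
        assert v == v, "NaN is not allowed"  # runs once per list element
        return v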
I know you specifically did not ask for a converter function, but I don't really see much of an issue in defining one that you can reuse as often as you need to. Here is one way to implement a converter for your specific case:
from collections.abc import Iterable
from typing import Any

from attrs import define, field

def float_list(iterable: Iterable[Any]) -> list[float]:
    return [float(item) for item in iterable]

@define
class A:
    values: list[float] = field(converter=float_list)

if __name__ == '__main__':
    a = A(values=["1.1", "2.2", "3.3"])
    print(a)
This is not much different from your example using converter=float.
The output is of course A(values=[1.1, 2.2, 3.3]).
You could even have your own generic converter factory for arbitrary convertible item types:
from collections.abc import Callable, Iterable
from typing import Any, TypeAlias, TypeVar

from attrs import define, field

T = TypeVar("T")
ItemConv: TypeAlias = Callable[[Any], T]
ListConv: TypeAlias = Callable[[Iterable[Any]], list[T]]

def list_of(item_type: ItemConv[T]) -> ListConv[T]:
    def converter(iterable: Iterable[Any]) -> list[T]:
        return [item_type(item) for item in iterable]
    return converter

@define
class B:
    foo: list[float] = field(converter=list_of(float))
    bar: list[int] = field(converter=list_of(int))
    baz: list[bool] = field(converter=list_of(bool))

if __name__ == '__main__':
    b = B(
        foo=range(0, 10, 2),
        bar=["1", "2", 3.],
        baz=(-1, 0, 100),
    )
    print(b)
Output: B(foo=[0.0, 2.0, 4.0, 6.0, 8.0], bar=[1, 2, 3], baz=[True, False, True])
The only downside to that approach is that the mypy plugin for attrs (for some reason) cannot handle this type of converter function and will complain, unless you add # type: ignore[misc] to the field definition in question.
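That is, the affected field would look like this:
foo: list[float] = field(converter=list_of(float))  # type: ignore[misc]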
You could use cattrs, a companion library to attrs for transforming data.
So after a pip install cattrs:
from functools import partial

import attrs
from cattrs import structure

@attrs.define(kw_only=True)
class A:
    values: list[float] = attrs.field(converter=partial(structure, cl=list[float]))

print(A(values=["1.1", "2.2", "3.3"]))
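Assuming cattrs's default structuring hooks, this prints A(values=[1.1, 2.2, 3.3]), the same as the converter-based approaches above.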

Why won't this work? - def (function) not being called from main()

I need to be able to use classes, but I'm just trying to get my simple code to work:
import pandas as pd, numpy as np

class OutOfCountry7ViewModel():
    def pandas_conversion(self):
        #from csv import readers
        deImport = pd.read_csv("ooc-exceptions.csv")
        d1 = pd.read_csv("CS_Out_Of_Country.csv", encoding='windows-1252', parse_dates=True)
        d2 = pd.read_csv("sccm-devices.csv", encoding='windows-1252')
        d3 = pd.read_csv("CTLDAPRawData.csv", encoding='windows-1252')
        #pandas join magic
        lj_df1 = pd.merge(d1, d2, left_on="ComputerName", right_on="Name", how="left")
        lj_df2 = pd.merge(d2, d3, left_on="PrimaryUser", right_on="Employee Number", how="left")
        #lj_df = plj_df1d.join(lj_df2, lsuffix=)
        df = (lj_df1)
        #print(df)
        df.to_csv('CS_Out_of_country_tabl.csv', index=False, header=df.columns, encoding='utf-8')
        csv = 'CS_Out_of_country_tabl.csv'
        return csv

def main():
    pandas_conversion(self)

if __name__ == '__main__':
    main()
I keep getting an error: NameError: name 'pandas_conversion' is not defined
Are you trying to do something like this?
import pandas as pd, numpy as np

class OutOfCountry7ViewModel():
    def pandas_conversion(self, csv):
        ...

    def main(self):
        self.pandas_conversion(csv)

if __name__ == '__main__':
    some_object = OutOfCountry7ViewModel()
    some_object.main()
This should work:
a = OutOfCountry7ViewModel()
a.pandas_conversion()
Hope this helped!
Try to remember the semantics and indentation of Python.

Unused import: numpy.

A class with no parent class needs no parentheses (line 3 of your code):
class OutOfCountry7ViewModel(): # wrong
class OutOfCountry7ViewModel: # right
There is no need for the ().

The same goes for
df = (lj_df1)
# the parentheses do nothing; if you meant to call a function here, its name is missing

If you're defining a method in the class, you have to add the instance parameter self and call the method through the instance:
def main(self):
    pandas_conversion(self) # wrong: calls an undefined name and passes self by hand
    self.pandas_conversion() # right
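Putting those fixes together, a minimal corrected skeleton (with the CSV-merging body elided) would look like this:
import pandas as pd

class OutOfCountry7ViewModel:
    def pandas_conversion(self):
        # ... read and merge the CSV files as in the original code ...
        return 'CS_Out_of_country_tabl.csv'

    def main(self):
        return self.pandas_conversion()

if __name__ == '__main__':
    OutOfCountry7ViewModel().main()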
I think there is something else wrong with your code: PyCharm flags def pandas_conversion(self): as potentially static. So the code you posted is incomplete; something is missing that we can't see.

Break import cycle while type checking

I have split a large class implementation into different packages [1], and have used an import inside a method body to avoid an import cycle, as follows:
# model.py
class MyInt:
    def __init__(self, value: int):
        self.value = value

    def is_prime(self) -> bool:
        from methods import is_prime
        return is_prime(self)

# methods.py
from model import MyInt

def is_prime(x: MyInt) -> bool:
    # TODO: actually implement this
    return x.value == 2 or x.value % 2 == 1
However, pytype is not happy about this, failing to find the pyi file when reaching the import cycle:
File "/home/bkim/Projects/mwe/model.py", line 6, in is_prime: Couldn't import pyi for 'methods' [pyi-error]
Can't find pyi for 'model', referenced from 'methods'
How can I avoid this and still get type-checking?
[1] I've done this with just one tiny utility method, actually. No need to yell about splitting a class across multiple packages.
This solution uses typing.TYPE_CHECKING to get one behavior during type checking and another at runtime:
import typing

class MyInt:
    def is_prime(self) -> bool:
        if typing.TYPE_CHECKING:
            return False
        from methods import is_prime
        return is_prime(self)
Curiously, using from typing import TYPE_CHECKING doesn't work here, which may be a pytype bug.
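An alternative sketch, assuming MyInt is only needed for annotations in methods.py (whether pytype accepts this may depend on the version): guard the import at module level and defer annotation evaluation, so nothing is imported at runtime and the cycle disappears.
# methods.py
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from model import MyInt  # only imported by type checkers, so no runtime cycle

def is_prime(x: MyInt) -> bool:
    # TODO: actually implement this
    return x.value == 2 or x.value % 2 == 1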

How to declare a numpy array of a particular type as a type in a dataclass

What I have:
I am creating a dataclass and I am stating the types of its elements:
from dataclasses import dataclass

import numpy

@dataclass
class Task:
    n_items: int
    max_weight: int
    max_size: int
    items: numpy.array(Item)  # incorrect way of doing it
What I want to do
I'd like to declare that items will be a numpy array of objects of class "Item".
You can put ndarray:
import numpy as np

class Task():
    n_items: int
    max_weight: int
    max_size: int
    items: np.ndarray
You have to use the ndarray class type:
import numpy as np

class Task():
    n_items: int
    max_weight: int
    max_size: int
    items: np.ndarray[<shapeType>, <convertedNumpyGenericType>]
Where <shapeType> is the type of the values that define the shape of the array (probably int) and <convertedNumpyGenericType> defines the type of the array's data. Be careful: you have to "convert" NumPy generic types into Python ones. You may want to use np.dtype[<generic>], with <generic> the generic NumPy type (e.g. np.float64).
If you want to set a default value (via the dataclass field function), you have to do it as follows:
items: np.ndarray[_, _] = field(default_factory=lambda: np.zeros(shape=<int>, dtype=<type>))
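For instance, a concrete version of that pattern (a sketch assuming a NumPy recent enough to support runtime subscription of np.ndarray, roughly 1.22+) could look like this:
from dataclasses import dataclass, field
from typing import Any

import numpy as np

@dataclass
class Task:
    n_items: int = 0
    # shape type left as Any; the dtype is spelled via np.dtype[np.float64]
    items: np.ndarray[Any, np.dtype[np.float64]] = field(
        default_factory=lambda: np.zeros(shape=3, dtype=np.float64)
    )

task = Task()
print(task.items)  # [0. 0. 0.]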
You can use the nptyping package, which offers type hints specifically for Numpy data types.
Unless you want to create a custom Numpy container, the best you can do is to denote your array as a container of typing.Any objects, since support for types beyond the ones mentioned here is lacking.
from typing import Any

import numpy as np
from nptyping import NDArray, Shape

class Item:
    pass

class Foo:
    def __init__(self, bar: NDArray[Shape["1,2"], Any]):
        self.bar = bar

if __name__ == '__main__':
    item = Item()
    foo = Foo(bar=np.array([Item(), Item()], dtype=object))  # dtype=object holds arbitrary Python objects
    print(foo.bar)
Running this will yield something like
[<__main__.Item object at 0x7f13f0dd9e80>
<__main__.Item object at 0x7f13f0dd9040>]

How to properly pass self in stacked decorators on class methods in Python?

First, I would like to have a class method that is concerned only with manipulating a DataFrame column, so that I can really focus on the manipulation itself rather than the background stuff. That background stuff is applying this simple function over specified columns (e.g. all numeric ones, stated explicitly by their column names).
To separate this from the nasty bits, I tried using decorators and actually succeeded.
However, the difficulty arose when I wanted to use a second decorator that is in fact a plotting method for each of those manipulated columns, to keep track of the manipulations.
The code below is a working, simplified version:
plotter merely prints each column's name rather than actually plotting it.
Note the comment on self below, which is what allows this code to work properly; I don't understand why.
import numpy as np
import pandas as pd

class test_class:
    def __init__(self):
        self.df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))

    def plotter(fn):
        def printer(self, **kwargs):
            print('printer call')
            for col in kwargs['column']:
                print(col)
            return fn(self, **kwargs)  # this self allows applyer to reference self; otherwise: applyer missing positional argument
        return printer

    def wrapapply(fn):
        def applyer(self, **kwargs):
            print('applyer call')
            fnkwargs = {k: v for k, v in kwargs.items() if k != 'column'}  # clean up the decorator's arguments
            self.df[kwargs['column']] = pd.DataFrame.apply(self.df[kwargs['column']], func=fn, axis=0, **fnkwargs)
        return applyer

    @plotter
    @wrapapply
    def norm(column):
        return (column - np.mean(column)) / np.std(column)

if __name__ == '__main__':
    a = test_class()
    a.norm(column=['A', 'B'])
    a.norm(column=['D'])
    print(a.df)
The result I expect is:
a silent in-place manipulation of all columns A, B, D in the DataFrame;
each of the column names of a call printed by a separate decorator function (as in my application, this is in fact a plotting method).
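For what it's worth, a minimal sketch of why that self must be forwarded explicitly: when a.norm(...) is called, the descriptor protocol binds self only to the outermost wrapper; every inner wrapper is a plain function and only sees self if the outer one passes it along (the Demo class and names here are purely illustrative):
class Demo:
    def outer(fn):  # a decorator defined in the class body receives the plain function
        def wrapper(self, *args, **kwargs):
            print('wrapper got self:', self)
            return fn(self, *args, **kwargs)  # self must be forwarded by hand
        return wrapper

    @outer
    def method(self, x):
        return x * 2

print(Demo().method(21))  # prints the bound self, then 42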
