Alternative solution not using closures - python-3.x

I have a class Data which I want to filter using the below api.
# Example: filter using where
inpt = {"a":np.array((1,2,3,4,2,5,6,2,3,3,2,1)),
"b":np.random.rand(12)}
data = (Data(inpt)
.where(col("a").equals(3)) # This is how where should be called.
)
data
where is a method from class Data
col("a").equals(3) is syntactic sugar for inpt["a"] == 3
I am able to achieve this using another class Expr which handles all the functionality within Data.where() using closures. Reason for this being that Expr doesn't have access to Data.
Questions: can someone provide me with an alternative approach not involving closures. My goal is to learn new approaches / directions.
Here is my code:
from __future__ import annotations
from typing import Dict, Any
import numpy as np
class Data:
def __init__(self, data: Dict):
self._data = data
def where(self, e: Expr) -> Data:
idx = e.collect(self)
for k,v in self._data.items():
self._data[k] = v[idx]
return self
def __repr__(self):
return str(self._data)
class Expr:
def __init__(self):
self.fs = []
def col(self, s: str) -> Self:
f = lambda x: x._data[s]
self.fs.append(f)
return self
def equals(self, el: Any) -> Self:
f = lambda x: x == el
self.fs.append(f)
return self
def collect(self, x: Data) -> Data:
args = x
for f in self.fs:
args = f(args)
return args
def col(s: str) -> Expr:
return Expr().col(s)

I don't really understand the point. Maybe if you give an example of what you're actually trying to do?
If you already know the right key, you can just check directly. If you want to find the right key, the pythonic way is to use a list comprehension.
In [2]: inpt = {
...: "a": (1,2,3,4,2,5,6,2,3,3,2,1),
...: "b": 3,
...: }
In [3]: inpt["a"] == 3
Out[3]: False
In [4]: inpt["b"] == 3
Out[4]: True
In [5]: [key for key, value in inpt.items() if value == 3][0]
Out[5]: 'b'
In [8]: from typing import Sequence
In [9]: [key for key, value in inpt.items() if isinstance(value, Sequence) and 3 in value][0]
Out[9]: 'a'

Related

Overriding the `[]` operator in a dictionary of dictionaries

I am trying to implement a class which provides a dictionary with a default value:
from copy import deepcopy
class Dict:
def __init__(self, default) -> None:
self.default = default
self.values = {}
def __getitem__(self, key):
return self.values[key] if key in self.values else deepcopy(self.default)
def __setitem__(self, key, value):
self.values[key] = value
It works as expected when the default value is "plain" (42 in the example below):
KEY = 'k'
d = Dict(42)
print(d[KEY]) # prints 42
d[KEY] = 53
print(d[KEY]) # prints 53
But it doesn't work as expected when the default value is by itself a Dict object:
KEY1 = 'k1'
KEY2 = 'k2'
d = Dict(Dict(42))
print(d[KEY1][KEY2]) # prints 42
d[KEY1][KEY2] = 53
print(d[KEY1][KEY2]) # prints 42
I have tried to debug that by adding various printouts within the class functions, but I haven't been able to figure it out.
What exactly am I doing wrong here?
The immediate problem is in your __getitem__ method:
def __getitem__(self, key):
return self.values[key] if key in self.values else deepcopy(self.default)
Because you're only returning a value here, but not actually setting it, the returned value isn't useful. If you request a key that doesn't exist, the method is equivalent to:
def __getitem__(self, key):
return deepcopy(self.default)
So when you write:
d[KEY1][KEY2] = 53
You're successfully setting a value for KEY2, but only in the dictionary returned by __getitem__. You probably want to use the dictionary setdefault method, which will set the key in self.values if it doesn't exist (in addition to returning it):
def __getitem__(self, key):
return self.values.setdefault(key, deepcopy(self.default))
With this implementation:
>>> KEY1 = 'k1'
>>> KEY2 = 'k2'
>>> d = Dict(Dict(42))
>>> print(d[KEY1][KEY2])
42
>>> d[KEY1][KEY2] = 53
>>> print(d[KEY1][KEY2])
53
But as I mentioned in my comment, a better solution is just to use the existing defaultdict implementation:
>>> from collections import defaultdict
>>> d = defaultdict(lambda: defaultdict(lambda: 42))
>>> d[KEY1][KEY2]
42
>>> d[KEY1][KEY2]=53
>>> d[KEY1][KEY2]
53
(The difference between defaultdict and the class you implemented is that the default must be a callable. Here's I've used lambda expressions, but you could also use actual functions, classes, etc).
Since you are using deepcopy so it creates a copy without reference.
You have to return the object without deepcopy.
def __getitem__(self, key):
return self.values[key] if key in self.values else self.default
Now it should work as expected.

ListError when mapping Pydantic class to csv

I am trying to use pydantic classes to represent records of a CSV. Some fields in this CSV represent things like numbers, dates, encoded lists that are better handled as such. So I assign the appropriate type to the coresponding pydantic field and rely on pydantic to cast the string to the type. Unfortunately this fails for lists.
from typing import List
import csv
from pydantic import BaseModel
class Foo(BaseModel):
a: int
b: List[str]
c: str
# Write class to CSV
x = Foo(a=1, b=["hello", "world"], c="foo")
with open("/tmp/test.csv", "w") as f:
writer = csv.DictWriter(f, fieldnames=x.dict().keys())
writer.writeheader()
writer.writerow(x.dict())
# Try to load the class back from CSV
with open("/tmp/test.csv") as f:
reader = csv.DictReader(f)
y = Foo(**next(reader))
I expect that y would be instance with the same values as x, but instead it crashes with ListError. This code does succeed in outputting /tmp/test.csv, and its contents are:
a,b,c
1,"['hello', 'world']",foo
How can I solve this problem?
So, here is how I would do what you want to do:
from typing import List
from pydantic import BaseModel, Field, validator
import json
class Foo(BaseModel):
bar: int = None
baz: List[pydantic.StrictStr] = Field(default_factory=list)
#validator('baz', pre=True)
def _maybe_json(cls, v):
if isinstance(v, str):
try:
return json.loads(v)
except json.JSONDecodeError as e:
raise ValueError("not valid JSON") from e
return v
def to_csv_row(self):
row = self.dict()
row["baz"] = json.dumps(row["baz"])
return row
Note how StrictStr handles this:
In [4]: Foo(baz='["a"]')
Out[4]: Foo(bar=None, baz=['a'])
In [5]: Foo(baz='[1]')
---------------------------------------------------------------------------
ValidationError Traceback (most recent call last)
Input In [22], in <cell line: 1>()
----> 1 Foo(baz='[1]')
File ~/miniconda3/envs/maze-etl/lib/python3.9/site-packages/pydantic/main.py:331, in pydantic.main.BaseModel.__init__()
ValidationError: 1 validation error for Foo
baz -> 0
str type expected (type=type_error.str)
But if you don't want that just use List[str]
And just use it like:
In [10]: foo = Foo(bar=1, baz=['a','b','c'])
In [11]: foo
Out[11]: Foo(bar=1, baz=['a', 'b', 'c'])
In [12]: foo.to_csv_row()
Out[12]: {'bar': 1, 'baz': '["a", "b", "c"]'}
The solution I found was to create a validator that checks the value being passed, and if it's a string, tries to eval it to a Python list.
class Foo(BaseModel):
a: int
b: List[str]
c: str
#validator("b", pre=True)
def eval_list(cls, val):
if isinstance(val, List):
return val
else:
return ast.literal_eval(val)
This can of course potentially allow people to inject Python code via the CSV, and it is possible to construct lists which cannot be reconstructed from their string representation. In my case, the CSVs are all created by me and the lists are simple lists of string, so this limitation is not a problem.

Python: iterate through functions with different arguments

I have a class with three functions:
class MyClass:
def f1(self, int_arg):
return int_arg
def f2(self, list_arg):
return list_arg
def f3(self, int_arg, list_arg):
return int_arg + sum(list_arg)
The value of the arguments of these functions is fixed:
int_arg = 1
list_arg = [1,2]
Now, I want to iterate through the functions of my class and execute them, in the following way:
for f in ['f1', 'f2', 'f3']:
out = getattr(MyClass(), f)(<arguments>)
Now, what is a smart way of dealing with the fact that different functions have different arguments?
In short, you want to know which parameters a function receives.
For that you may use inspect.signature:
from inspect import signature
def sub_dict(d, keys):
return dict((k, d[k]) for k in keys)
int_arg = 1
list_arg = [1,2]
params = dict(int_arg=int_arg, list_arg=list_arg)
for f in ['f1', 'f2', 'f3']:
func = getattr(MyClass(), f)
out = func(**sub_dict(params, signature(func).parameters))

Python regular expressions: Better way to handle non-matches?

When I deal with regular expressions, my code is littered with conditionals so as to not create exceptions when a pattern is not found:
m = some_compiled_pattern.match(s)
if m:
x = m.groups()
do_something_with(x)
m = some_other_compiled_pattern.search(s):
if m:
y = m.groupdict()
else:
y = {}
do_something_else_with(y)
Isn't there a better (less verbose) way to handle such exceptions?
You might find this class useful to reduce most of those if-no-match handling to a one line.
class Returns:
"""
Makes an object that pretends to have all possible methods,
but returns the same value (default None) no matter what this method,
or its arguments, is.
"""
def __init__(self, return_val=None):
self.return_val = return_val
def the_only_method_there_is(*args, **kwargs):
return return_val
self.the_only_method_there_is = MethodType(the_only_method_there_is, self)
def __getattr__(self, item):
if not item.startswith('_') and item not in {'return_val', 'the_only_method_there_id'}:
return self.the_only_method_there_is
else:
return getattr(self, item)
Example use:
>>> import re
>>> p = re.compile(r'(\d+)\W+(\w+)')
>>>
>>> # when all goes well...
>>> m = p.search('The number 42 is mentioned often')
>>> num, next_word = m.groups()
>>> num, next_word
('42', 'is')
>>>
>>> # when the pattern is not found...
>>> m = p.search('No number here')
>>> assert m is None # m is None so...
>>> num, next_word = m.groups() # ... this is going to choke
Traceback (most recent call last):
...
AttributeError: 'NoneType' object has no attribute 'groups'
>>>
>>> # Returns to the rescue
>>> num, next_word = (p.search('No number here') or Returns((None, 'default_word'))).groups()
>>> assert num is None
>>> next_word
'default_word'
EDIT: See this gist for a longer discussion (and alternate but similar solution) of this problem.

What's the underlying implementation for most_common method of Counter?

I found a pyi file which has the following def
def most_common(self, n: Optional[int] = ...) -> List[Tuple[_T, int]]: ...
How could this happen? List is not defined, and no implementation?
Just highlight some valuable suggestions here for followers:
List is imported from the typing module; it's not the same thing as list. The .pyi file doesn't need to import it because stub files are never executed; they just have to be syntactically valid Python
If you use from future import annotations, you won't have to import typing to use List et al. in function annotations in .py files, either, since function annotations will be treated as string literals. (Starting in Python 4, that will be the default behavior. See PEP 563 for details.)
You are looking at the pyi file which is used solely for annotations. It is never executed by the Python interpreter. You can learn more about pyi files by reading PEP484.
Using a debugger, put a breakpoint on the line where you call most_commonand then step into the method.
Python 3.7 implementation.
...\Lib\collections\__init__.py:
def most_common(self, n=None):
'''List the n most common elements and their counts from the most
common to the least. If n is None, then list all element counts.
>>> Counter('abcdeabcdabcaba').most_common(3)
[('a', 5), ('b', 4), ('c', 3)]
'''
# Emulate Bag.sortedByCount from Smalltalk
if n is None:
return sorted(self.items(), key=_itemgetter(1), reverse=True)
return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
_heapq.nlargest (in ...\Lib\heapq.py) implementation:
def nlargest(n, iterable, key=None):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, key=key, reverse=True)[:n]
"""
# Short-cut for n==1 is to use max()
if n == 1:
it = iter(iterable)
sentinel = object()
if key is None:
result = max(it, default=sentinel)
else:
result = max(it, default=sentinel, key=key)
return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted()
try:
size = len(iterable)
except (TypeError, AttributeError):
pass
else:
if n >= size:
return sorted(iterable, key=key, reverse=True)[:n]
# When key is none, use simpler decoration
if key is None:
it = iter(iterable)
result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]
if not result:
return result
heapify(result)
top = result[0][0]
order = -n
_heapreplace = heapreplace
for elem in it:
if top < elem:
_heapreplace(result, (elem, order))
top, _order = result[0]
order -= 1
result.sort(reverse=True)
return [elem for (elem, order) in result]
# General case, slowest method
it = iter(iterable)
result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
if not result:
return result
heapify(result)
top = result[0][0]
order = -n
_heapreplace = heapreplace
for elem in it:
k = key(elem)
if top < k:
_heapreplace(result, (k, order, elem))
top, _order, _elem = result[0]
order -= 1
result.sort(reverse=True)
return [elem for (k, order, elem) in result]

Resources