compute distance for data frame columns from python list - apache-spark

I have a dataframe
data = sqlContext.createDataFrame([[33.603699, -83.967819], [43.609422, -84.188726], [40.751800537, -74.066200256]], ['a', 'b'])
and I have a list of lat/lon pairs. For each lat/lon pair in the data I want to compute the distance to each lat/lon pair in the list. I am using code from this answer as my distance function:
How to sum distances between data points in a dataset using (Py)Spark?
lat_lon_list=[[26.145677, -80.120355],[26.179337, -80.25151600000001],[26.188919, -98.21469499999999], [26.641769, -81.875031]]
def dist_2(long_x, lat_x, long_y, lat_y):
    z0 = np.sin(np.radians(lat_y))
    z1 = np.cos(np.radians(lat_y))
    z3 = np.radians(long_y)
    return F.acos(F.sin(F.toRadians(F.col(long_x)) * z0 +
                        F.cos(F.toRadians(F.col(lat_x))) * z1 *
                        F.cos(F.toRadians(F.col(long_x))) - z3
                        ) * F.lit((6371.0) * (0.621371)))

def dist_1(x, y):
    return [dist_2(x, y, c[0], c[1]) for c in lat_lon_list]
When I try to compute the distances I get the following error:
data.select('a','b',dist_1('a','b')).show()
TypeError Traceback (most recent call last)
<ipython-input-53-8ec09912a7b1> in <module>()
     24
     25
---> 26 data.select('a','b',dist_1('a','b')).show()
/opt/spark/current/python/pyspark/sql/dataframe.py in select(self, *cols)
    859         [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
    860         """
--> 861         jdf = self._jdf.select(self._jcols(*cols))
    862         return DataFrame(jdf, self.sql_ctx)
    863
/opt/spark/current/python/pyspark/sql/dataframe.py in _jcols(self, *cols)
    714         if len(cols) == 1 and isinstance(cols[0], list):
    715             cols = cols[0]
--> 716         return self._jseq(cols, _to_java_column)
    717
    718     def _sort_cols(self, cols, kwargs):
/opt/spark/current/python/pyspark/sql/dataframe.py in _jseq(self, cols, converter)
    701     def _jseq(self, cols, converter=None):
    702         """Return a JVM Seq of Columns from a list of Column or names"""
--> 703         return _to_seq(self.sql_ctx._sc, cols, converter)
    704
    705     def _jmap(self, jm):
/opt/spark/current/python/pyspark/sql/column.py in _to_seq(sc, cols, converter)
     57     """
     58     if converter:
---> 59         cols = [converter(c) for c in cols]
     60     return sc._jvm.PythonUtils.toSeq(cols)
     61
/opt/spark/current/python/pyspark/sql/column.py in _to_java_column(col)
     45         jcol = col._jc
     46     else:
---> 47         jcol = _create_column_from_name(col)
     48     return jcol
     49
/opt/spark/current/python/pyspark/sql/column.py in _create_column_from_name(name)
     38 def _create_column_from_name(name):
     39     sc = SparkContext._active_spark_context
---> 40     return sc._jvm.functions.col(name)
     41
     42
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1122
   1123     def __call__(self, *args):
-> 1124         args_command, temp_args = self._build_args(*args)
   1125
   1126         command = proto.CALL_COMMAND_NAME +\
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _build_args(self, *args)
   1086     def _build_args(self, *args):
   1087         if self.converters is not None and len(self.converters) > 0:
-> 1088             (new_args, temp_args) = self._get_args(args)
   1089         else:
   1090             new_args = args
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _get_args(self, args)
   1073             for converter in self.gateway_client.converters:
   1074                 if converter.can_convert(arg):
-> 1075                     temp_arg = converter.convert(arg, self.gateway_client)
   1076                     temp_args.append(temp_arg)
   1077                     new_args.append(temp_arg)
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_collections.py in convert(self, object, gateway_client)
    499         java_list = ArrayList()
    500         for element in object:
--> 501             java_list.add(element)
    502         return java_list
    503
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1122
   1123     def __call__(self, *args):
-> 1124         args_command, temp_args = self._build_args(*args)
   1125
   1126         command = proto.CALL_COMMAND_NAME +\
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _build_args(self, *args)
   1086     def _build_args(self, *args):
   1087         if self.converters is not None and len(self.converters) > 0:
-> 1088             (new_args, temp_args) = self._get_args(args)
   1089         else:
   1090             new_args = args
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _get_args(self, args)
   1073             for converter in self.gateway_client.converters:
   1074                 if converter.can_convert(arg):
-> 1075                     temp_arg = converter.convert(arg, self.gateway_client)
   1076                     temp_args.append(temp_arg)
   1077                     new_args.append(temp_arg)
/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_collections.py in convert(self, object, gateway_client)
    510         HashMap = JavaClass("java.util.HashMap", gateway_client)
    511         java_map = HashMap()
--> 512         for key in object.keys():
    513             java_map[key] = object[key]
    514         return java_map
TypeError: 'Column' object is not callable
Any help would be appreciated.

This is because your function returns a list of Columns, while select expects individual columns. You can unpack:
data.select('a', 'b', *dist_1('a', 'b'))
or combine the lists:
data.select(['a', 'b'] + dist_1('a', 'b'))
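For completeness, a minimal end-to-end sketch of the fix, assuming sqlContext, lat_lon_list, dist_2 and dist_1 as defined in the question (the dist_i aliases are hypothetical, added only to label the output columns):
import numpy as np
from pyspark.sql import functions as F

# Unpack the list of Columns returned by dist_1 into select()
data = sqlContext.createDataFrame(
    [[33.603699, -83.967819], [43.609422, -84.188726], [40.751800537, -74.066200256]],
    ['a', 'b'])
distance_cols = [d.alias('dist_%d' % i)  # hypothetical names for readability
                 for i, d in enumerate(dist_1('a', 'b'))]
data.select('a', 'b', *distance_cols).show()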

Related

jax.lax.fori_loop Abstract tracer value encountered where concrete value is expected

I have a JAX loop that looks like this, where inside the step function I use min between the two arguments:
import jax

def step(timestep: int, order: int = 4) -> int:
    order = min(timestep + 1, order)
    return order

num_steps = 10
order = 100
order = jax.lax.fori_loop(0, num_steps, step, order)
The above code fails with a jax._src.errors.ConcretizationTypeError. This is the full stack trace:
WARNING:jax._src.lib.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
---------------------------------------------------------------------------
UnfilteredStackTrace Traceback (most recent call last)
<ipython-input-4-9ec280f437cb> in <module>
2 order = 100
----> 3 order = jax.lax.fori_loop(0, num_steps, step, order)
16 frames
/usr/local/lib/python3.8/dist-packages/jax/_src/traceback_util.py in reraise_with_filtered_traceback(*args, **kwargs)
161 try:
--> 162 return fun(*args, **kwargs)
163 except Exception as e:
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py in fori_loop(lower, upper, body_fun, init_val)
1691
-> 1692 (_, result), _ = scan(_fori_scan_body_fun(body_fun), (lower_, init_val),
1693 None, length=upper_ - lower_)
/usr/local/lib/python3.8/dist-packages/jax/_src/traceback_util.py in reraise_with_filtered_traceback(*args, **kwargs)
161 try:
--> 162 return fun(*args, **kwargs)
163 except Exception as e:
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py in scan(f, init, xs, length, reverse, unroll)
258 # necessary, a second time with modified init values.
--> 259 init_flat, carry_avals, carry_avals_out, init_tree, *rest = _create_jaxpr(init)
260 new_init_flat, changed = _promote_weak_typed_inputs(init_flat, carry_avals, carry_avals_out)
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py in _create_jaxpr(init)
244 carry_avals = tuple(_map(_abstractify, init_flat))
--> 245 jaxpr, consts, out_tree = _initial_style_jaxpr(
246 f, in_tree, (*carry_avals, *x_avals), "scan")
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/common.py in _initial_style_jaxpr(fun, in_tree, in_avals, primitive_name)
59 primitive_name: Optional[str] = None):
---> 60 jaxpr, consts, out_tree = _initial_style_open_jaxpr(
61 fun, in_tree, in_avals, primitive_name)
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/common.py in _initial_style_open_jaxpr(fun, in_tree, in_avals, primitive_name)
53 debug = pe.debug_info(fun, in_tree, False, primitive_name or "<unknown>")
---> 54 jaxpr, _, consts = pe.trace_to_jaxpr_dynamic(wrapped_fun, in_avals, debug)
55 return jaxpr, consts, out_tree()
/usr/local/lib/python3.8/dist-packages/jax/_src/profiler.py in wrapper(*args, **kwargs)
313 with TraceAnnotation(name, **decorator_kwargs):
--> 314 return func(*args, **kwargs)
315 return wrapper
/usr/local/lib/python3.8/dist-packages/jax/interpreters/partial_eval.py in trace_to_jaxpr_dynamic(fun, in_avals, debug_info, keep_inputs)
1980 main.jaxpr_stack = () # type: ignore
-> 1981 jaxpr, out_avals, consts = trace_to_subjaxpr_dynamic(
1982 fun, main, in_avals, keep_inputs=keep_inputs, debug_info=debug_info)
/usr/local/lib/python3.8/dist-packages/jax/interpreters/partial_eval.py in trace_to_subjaxpr_dynamic(fun, main, in_avals, keep_inputs, debug_info)
1997 in_tracers_ = [t for t, keep in zip(in_tracers, keep_inputs) if keep]
-> 1998 ans = fun.call_wrapped(*in_tracers_)
1999 out_tracers = map(trace.full_raise, ans)
/usr/local/lib/python3.8/dist-packages/jax/linear_util.py in call_wrapped(self, *args, **kwargs)
166 try:
--> 167 ans = self.f(*args, **dict(self.params, **kwargs))
168 except:
/usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py in scanned_fun(loop_carry, _)
1607 i, x = loop_carry
-> 1608 return (i + 1, body_fun()(i, x)), None
1609 return scanned_fun
<ipython-input-2-2e3345899235> in step(timestep, order)
1 def step(timestep: int, order: int = 100) -> int:
----> 2 order = min(timestep + 1, order)
3 return order
/usr/local/lib/python3.8/dist-packages/jax/core.py in __bool__(self)
633 def __nonzero__(self): return self.aval._nonzero(self)
--> 634 def __bool__(self): return self.aval._bool(self)
635 def __int__(self): return self.aval._int(self)
/usr/local/lib/python3.8/dist-packages/jax/core.py in error(self, arg)
1266 def error(self, arg):
-> 1267 raise ConcretizationTypeError(arg, fname_context)
1268 return error
UnfilteredStackTrace: jax._src.errors.ConcretizationTypeError: Abstract tracer value encountered where concrete value is expected: Traced<ShapedArray(bool[], weak_type=True)>with<DynamicJaxprTrace(level=1/0)>
The problem arose with the `bool` function.
The error occurred while tracing the function scanned_fun at /usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py:1606 for scan. This concrete value was not available in Python because it depends on the values of the argument 'loop_carry'.
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.ConcretizationTypeError
The stack trace below excludes JAX-internal frames.
The preceding is the original exception that occurred, unmodified.
--------------------
The above exception was the direct cause of the following exception:
ConcretizationTypeError Traceback (most recent call last)
<ipython-input-4-9ec280f437cb> in <module>
1 num_steps = 10
2 order = 100
----> 3 order = jax.lax.fori_loop(0, num_steps, step, order)
<ipython-input-2-2e3345899235> in step(timestep, order)
1 def step(timestep: int, order: int = 100) -> int:
----> 2 order = min(timestep + 1, order)
3 return order
ConcretizationTypeError: Abstract tracer value encountered where concrete value is expected: Traced<ShapedArray(bool[], weak_type=True)>with<DynamicJaxprTrace(level=1/0)>
The problem arose with the `bool` function.
The error occurred while tracing the function scanned_fun at /usr/local/lib/python3.8/dist-packages/jax/_src/lax/control_flow/loops.py:1606 for scan. This concrete value was not available in Python because it depends on the values of the argument 'loop_carry'.
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.ConcretizationTypeError
Everything works fine if, instead of using jax.lax.fori_loop, I use a simple Python loop, but my original code will end up very slow. How can I fix this issue?
Use jax.numpy.minimum in place of min:
def step(timestep: int, order: int = 4) -> int:
    order = jax.numpy.minimum(timestep + 1, order)
    return order
The reason min does not work is that in the course of executing code within jit, grad, vmap, fori_loop, etc., JAX replaces concrete values with abstract tracers, and Python functions like min don't know how to handle these abstract values. See How to Think in JAX for more background on this.
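For instance, a quick check of the fix under the question's setup (a sketch; note the final value is 1 because min(timestep + 1, carry) already drops the carry to 1 at timestep 0):
import jax
import jax.numpy as jnp

def step(timestep: int, order: int = 4) -> int:
    # jnp.minimum operates on abstract tracers, unlike Python's built-in min
    return jnp.minimum(timestep + 1, order)

num_steps = 10
order = 100
order = jax.lax.fori_loop(0, num_steps, step, order)
print(order)  # traces cleanly; prints 1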

Color heatmap in different colors by column in dataframe

I have a dataframe which I'm trying to plot as a heatmap.
df = pd.DataFrame(np.random.randint(0, 2, size=(10, 5)),
                  columns=['1', '2', '3', '4', '5'])
cluster = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
df['cluster'] = cluster
x_axis_labels = []
for i in range(1, 6):
    x_axis_labels.append(i)
fig, ax = plt.subplots(figsize=(4, 10))
from scipy.ndimage import gaussian_filter
np_smooth = gaussian_filter(df, sigma=0.75)
sns.heatmap(np_smooth, cmap="YlGnBu",
            xticklabels=x_axis_labels,
            yticklabels=False, cbar=False)
plt.show()
And this is the output:
[image: heatmap drawn with a single YlGnBu colormap]
Each row of the heatmap represents a row in the df. I would like to color each cluster in a different color.
like in this photo:
[image: heatmap with each cluster shaded in a different color]
I've added the following code, but it gives me an error. I'd be happy for some help!
The additional code:
cmaps = {'1': 'Blues_r', '2': 'Greens_r', '3': 'Blues_r', '4': 'Greens_r', '5': 'Blues_r', '6': 'Greens_r', '7': 'Blues_r', '0': 'Greens_r'}
for clus, cmap in cmaps.items():
    mask = df.apply(lambda x: x if x['cluster'] == int(clus) else 0, result_type='broadcast',
                    axis=1).eq(0)
    sns.heatmap(np_smooth, mask=mask, cmap=cmap, xticklabels=x_axis_labels, yticklabels=False,
                cbar=False, ax=ax)
plt.show()
The error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\numexpr\necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
826 try:
--> 827 compiled_ex = _numexpr_cache[numexpr_key]
828 except KeyError:
KeyError: ('a_value | b_value', (('optimization', 'aggressive'), ('truediv', True)), (('a_value', <class 'numpy.float64'>), ('b_value', <class 'bool'>)))
During handling of the above exception, another exception occurred:
NotImplementedError Traceback (most recent call last)
<ipython-input-191-2f3e7e4bc9aa> in cluster_heatmap(df, plot_name, base_num)
24 ).eq(0)
25 # plot masked heatmap on reusable ax
---> 26 sns.heatmap(np_smooth, mask=mask, cmap=cmap, ax=ax, xticklabels=x_axis_labels, yticklabels=False, cbar=False)
27
28
~\Anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
~\Anaconda3\lib\site-packages\seaborn\matrix.py in heatmap(data, vmin, vmax, cmap, center, robust, annot, fmt, annot_kws, linewidths, linecolor, cbar, cbar_kws, cbar_ax, square, xticklabels, yticklabels, mask, ax, **kwargs)
540 plotter = _HeatMapper(data, vmin, vmax, cmap, center, robust, annot, fmt,
541 annot_kws, cbar, cbar_kws, xticklabels,
--> 542 yticklabels, mask)
543
544 # Add the pcolormesh kwargs here
~\Anaconda3\lib\site-packages\seaborn\matrix.py in __init__(self, data, vmin, vmax, cmap, center, robust, annot, fmt, annot_kws, cbar, cbar_kws, xticklabels, yticklabels, mask)
107
108 # Validate the mask and convet to DataFrame
--> 109 mask = _matrix_mask(data, mask)
110
111 plot_data = np.ma.masked_where(np.asarray(mask), plot_data)
~\Anaconda3\lib\site-packages\seaborn\matrix.py in _matrix_mask(data, mask)
86 # This works around an issue where `plt.pcolormesh` doesn't represent
87 # missing data properly
---> 88 mask = mask | pd.isnull(data)
89
90 return mask
~\Anaconda3\lib\site-packages\pandas\core\ops.py in f(self, other, axis, level, fill_value)
2021 # Another DataFrame
2022 pass_op = op if should_series_dispatch(self, other, op) else na_op
-> 2023 return self._combine_frame(other, pass_op, fill_value, level)
2024 elif isinstance(other, ABCSeries):
2025 # For these values of `axis`, we end up dispatching to Series op,
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _combine_frame(self, other, func, fill_value, level)
5086 if ops.should_series_dispatch(this, other, func):
5087 # iterate over columns
-> 5088 return ops.dispatch_to_series(this, other, _arith_op)
5089 else:
5090 result = _arith_op(this.values, other.values)
~\Anaconda3\lib\site-packages\pandas\core\ops.py in dispatch_to_series(left, right, func, str_rep, axis)
1155 raise NotImplementedError(right)
1156
-> 1157 new_data = expressions.evaluate(column_op, str_rep, left, right)
1158
1159 result = left._constructor(new_data, index=left.index, copy=False)
~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
206 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
207 if use_numexpr:
--> 208 return _evaluate(op, op_str, a, b, **eval_kwargs)
209 return _evaluate_standard(op, op_str, a, b)
210
~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
121
122 if result is None:
--> 123 result = _evaluate_standard(op, op_str, a, b)
124
125 return result
~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py in _evaluate_standard(op, op_str, a, b, **eval_kwargs)
66 _store_test_result(False)
67 with np.errstate(all='ignore'):
---> 68 return op(a, b)
69
70
~\Anaconda3\lib\site-packages\pandas\core\ops.py in column_op(a, b)
1133 def column_op(a, b):
1134 return {i: func(a.iloc[:, i], b.iloc[:, i])
-> 1135 for i in range(len(a.columns))}
1136
1137 elif isinstance(right, ABCSeries) and axis == "columns":
~\Anaconda3\lib\site-packages\pandas\core\ops.py in <dictcomp>(.0)
1133 def column_op(a, b):
1134 return {i: func(a.iloc[:, i], b.iloc[:, i])
-> 1135 for i in range(len(a.columns))}
1136
1137 elif isinstance(right, ABCSeries) and axis == "columns":
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _arith_op(left, right)
5082 # left._binop(right, func, fill_value=fill_value)
5083 left, right = ops.fill_binop(left, right, fill_value)
-> 5084 return func(left, right)
5085
5086 if ops.should_series_dispatch(this, other, func):
~\Anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1999
2000 try:
-> 2001 result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
2002 except TypeError:
2003 result = masked_arith_op(x, y, op)
~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py in evaluate(op, op_str, a, b, use_numexpr, **eval_kwargs)
206 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
207 if use_numexpr:
--> 208 return _evaluate(op, op_str, a, b, **eval_kwargs)
209 return _evaluate_standard(op, op_str, a, b)
210
~\Anaconda3\lib\site-packages\pandas\core\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, truediv, reversed, **eval_kwargs)
112 'b_value': b_value},
113 casting='safe', truediv=truediv,
--> 114 **eval_kwargs)
115 except ValueError as detail:
116 if 'unknown type object' in str(detail):
~\Anaconda3\lib\site-packages\numexpr\necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
827 compiled_ex = _numexpr_cache[numexpr_key]
828 except KeyError:
--> 829 compiled_ex = _numexpr_cache[numexpr_key] = NumExpr(ex, signature, **context)
830 kwargs = {'out': out, 'order': order, 'casting': casting,
831 'ex_uses_vml': ex_uses_vml}
~\Anaconda3\lib\site-packages\numexpr\necompiler.py in NumExpr(ex, signature, **kwargs)
624
625 context = getContext(kwargs, frame_depth=1)
--> 626 threeAddrProgram, inputsig, tempsig, constants, input_names = precompile(ex, signature, context)
627 program = compileThreeAddrForm(threeAddrProgram)
628 return interpreter.NumExpr(inputsig.encode('ascii'),
~\Anaconda3\lib\site-packages\numexpr\necompiler.py in precompile(ex, signature, context)
569 ast = ASTNode('op', value='copy', astKind=ex.astKind, children=(ast,))
570
--> 571 ast = typeCompileAst(ast)
572
573 aliases = collapseDuplicateSubtrees(ast)
~\Anaconda3\lib\site-packages\numexpr\necompiler.py in typeCompileAst(ast)
212 raise NotImplementedError(
213 "couldn't find matching opcode for '%s'"
--> 214 % (ast.value + '_' + retsig + basesig))
215 # First just cast constants, then cast variables if necessary:
216 for i, (have, want) in enumerate(zip(basesig, sig)):
NotImplementedError: couldn't find matching opcode for 'or_bdb'
Using a dataframe as the mask seems to give some errors. You could use mask.values to just grab the values.
The example code below makes the following changes:
the columns of the dataframe are changed from integer to float
the 'cluster' column is left out of the gaussian_filter and of the mask
the mask is calculated by repeating the mask value for the cluster column
the dictionary for the colormaps now uses numbers as keys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randint(0, 2, size=(10, 5)).astype(float),
                  columns=['1', '2', '3', '4', '5'])
cluster = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
df['cluster'] = cluster
x_axis_labels = range(1, 6)
fig, ax = plt.subplots(figsize=(4, 10))
from scipy.ndimage import gaussian_filter
np_smooth = gaussian_filter(df[df.columns[:-1]], sigma=0.75)
cmaps = {1: 'Reds_r', 2: 'Greys_r', 3: 'Blues_r', 4: 'Greens_r', 5: 'Purples_r', 6: 'Greens_r', 7: 'Blues_r', 0: 'Greens_r'}
for clus, cmap in cmaps.items():
    mask = np.repeat((df['cluster'] != int(clus)).values.reshape(-1, 1), len(df.columns) - 1, 1)
    if not mask.all():
        sns.heatmap(np_smooth, mask=mask, cmap=cmap, xticklabels=x_axis_labels, yticklabels=False,
                    cbar=False, ax=ax)
plt.tight_layout()
plt.show()
PS: To have consistent color ranges, you might use vmin=np_smooth.min(), vmax=np_smooth.max() in the calls to sns.heatmap().
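A sketch of that suggestion, changing only the heatmap call inside the loop above:
for clus, cmap in cmaps.items():
    mask = np.repeat((df['cluster'] != int(clus)).values.reshape(-1, 1), len(df.columns) - 1, 1)
    if not mask.all():
        # fix the color range so every cluster's colormap spans the same values
        sns.heatmap(np_smooth, mask=mask, cmap=cmap, vmin=np_smooth.min(), vmax=np_smooth.max(),
                    xticklabels=x_axis_labels, yticklabels=False, cbar=False, ax=ax)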

Cannot allocate memory in multiprocessing python

I want to apply my function (f1) to an array of numbers (cdr_test) using multiprocessing. My code:
import itertools
import multiprocessing as mp
from collections import Counter

cdr_test = [x for x in range(0, 100000)]

def f1(el):
    a = Counter()  # make a new vector for each cdr
    for k, v in d3.items():  # d3 is a dict defined elsewhere
        if el in v:
            a = a + Counter(itertools.product([el], v))
    return a

if __name__ == '__main__':
    pool = mp.Pool(20)
    results = pool.map(f1, cdr_test)
    pool.close()
    pool.join()
    out = open('out.txt', 'w')
    for result in results:
        for k, v in result.items():
            out.write('\t'.join(map(str, k)) + "\t" + str(v) + "\n")
    out.close()
I get 'cannot allocate memory'. If I use an array of smaller length (100), everything works.
Stacktrace:
OSError Traceback (most recent call last)
<ipython-input-3-b8dc4a3d12b3> in <module>()
9
10 if __name__ == '__main__':
---> 11 pool = mp.Pool(1000)
12 results = pool.map(f1, cdr_test)
13 #new section
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/context.py in Pool(self, processes, initializer, initargs, maxtasksperchild)
116 from .pool import Pool
117 return Pool(processes, initializer, initargs, maxtasksperchild,
--> 118 context=self.get_context())
119
120 def RawValue(self, typecode_or_type, *args):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/pool.py in __init__(self, processes, initializer, initargs, maxtasksperchild, context)
166 self._processes = processes
167 self._pool = []
--> 168 self._repopulate_pool()
169
170 self._worker_handler = threading.Thread(
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/pool.py in _repopulate_pool(self)
231 w.name = w.name.replace('Process', 'PoolWorker')
232 w.daemon = True
--> 233 w.start()
234 util.debug('added worker')
235
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 _children.add(self)
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
265 def _Popen(process_obj):
266 from .popen_fork import Popen
--> 267 return Popen(process_obj)
268
269 class SpawnProcess(process.BaseProcess):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in __init__(self, process_obj)
18 sys.stderr.flush()
19 self.returncode = None
---> 20 self._launch(process_obj)
21
22 def duplicate_for_child(self, fd):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in _launch(self, process_obj)
65 code = 1
66 parent_r, child_w = os.pipe()
---> 67 self.pid = os.fork()
68 if self.pid == 0:
69 try:
OSError: [Errno 12] Cannot allocate memory
Are there ways to solve this?
The code you show is different from the one in the error:
---> 11 pool = mp.Pool(1000)
You are trying to spawn far too many processes; the OS will run out of memory before it can allocate them all.
You don't need that many processes to carry out your job. Just use multiprocessing.cpu_count() to find out how many CPUs your platform has and spawn a pool of that size.
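A minimal sketch of that change, assuming f1 and cdr_test as defined in the question:
import multiprocessing as mp

if __name__ == '__main__':
    # size the pool to the machine instead of hard-coding a large worker count
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(f1, cdr_test)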

Python 3; Matplotlib; Box Plot Error

I am new to Python/pandas and trying to create a boxplot using the Iris data set.
Here is my code:
import pandas as pd
import matplotlib.pyplot as plt

iris_filename = '/Users/pro/Documents/Code/Data Science/Iris/IRIS.csv'
iris = pd.read_csv(iris_filename, header=None,
                   names=['sepal_lenght', 'sepal_width', 'petal_lenght', 'petal_width', 'target'])
plt.boxplot(iris)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-20-e190e88674b0> in <module>()
----> 1 plt.boxplot(iris)
/anaconda/lib/python3.5/site-packages/matplotlib/pyplot.py in boxplot(x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_xticks, autorange, zorder, hold, data)
2784 whiskerprops=whiskerprops,
2785 manage_xticks=manage_xticks, autorange=autorange,
-> 2786 zorder=zorder, data=data)
2787 finally:
2788 ax._hold = washold
/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
1890 warnings.warn(msg % (label_namer, func.__name__),
1891 RuntimeWarning, stacklevel=2)
-> 1892 return func(ax, *args, **kwargs)
1893 pre_doc = inner.__doc__
1894 if pre_doc is None:
/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in boxplot(self, x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_xticks, autorange, zorder)
3266 bootstrap = rcParams['boxplot.bootstrap']
3267 bxpstats = cbook.boxplot_stats(x, whis=whis, bootstrap=bootstrap,
-> 3268 labels=labels, autorange=autorange)
3269 if notch is None:
3270 notch = rcParams['boxplot.notch']
/anaconda/lib/python3.5/site-packages/matplotlib/cbook.py in boxplot_stats(X, whis, bootstrap, labels, autorange)
1984
1985 # convert X to a list of lists
-> 1986 X = _reshape_2D(X)
1987
1988 ncols = len(X)
/anaconda/lib/python3.5/site-packages/matplotlib/cbook.py in _reshape_2D(X)
2245 X = [X.ravel()]
2246 else:
-> 2247 X = [X[:, i] for i in xrange(ncols)]
2248 else:
2249 raise ValueError("input `X` must have 2 or fewer dimensions")
/anaconda/lib/python3.5/site-packages/matplotlib/cbook.py in <listcomp>(.0)
2245 X = [X.ravel()]
2246 else:
-> 2247 X = [X[:, i] for i in xrange(ncols)]
2248 else:
2249 raise ValueError("input `X` must have 2 or fewer dimensions")
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/anaconda/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1382 """Return the cached item, item represents a label indexer."""
1383 cache = self._item_cache
-> 1384 res = cache.get(item)
1385 if res is None:
1386 values = self._data.get(item)
TypeError: unhashable type: 'slice'
I have searched the web for this and cannot seem to find an answer for this issue. I would appreciate any help.
You are calling matplotlib to boxplot the DataFrame iris. As you are already using pandas for importing the .csv, you should also use it for plotting:
iris.boxplot()
See the pandas boxplot API.
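A minimal sketch, reusing iris_filename and the column names from the question (pandas boxplot should draw one box per numeric column, leaving the string 'target' column out):
import pandas as pd
import matplotlib.pyplot as plt

iris = pd.read_csv(iris_filename, header=None,
                   names=['sepal_lenght', 'sepal_width', 'petal_lenght', 'petal_width', 'target'])
iris.boxplot()
plt.show()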

Using the natural language toolkit in Jupyter notebook

Hello, I am trying to use NLTK to tokenize and generate some POS tags, but I get an error response in spite of importing nltk.
bs = BeautifulSoup(web.text, 'html.parser')
print(bs)
tokes = nltk.word_tokenize(bs)
tags = nltk.pos_tag(tokes)
TypeError Traceback (most recent call last)
<ipython-input-71-f1434047d3f5> in <module>()
1 bs=BeautifulSoup(web.text, 'html.parser')
2 print (bs)
----> 3 tokes=nltk.word_tokenize (bs)
4 tags= nltk.pos_tag(tokes)
5 tags
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in word_tokenize(text, language)
104 :param language: the model name in the Punkt corpus
105 """
--> 106 return [token for sent in sent_tokenize(text, language)
107 for token in _treebank_word_tokenize(sent)]
108
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
89 """
90 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 91 return tokenizer.tokenize(text)
92
93 # Standard word tokenizer.
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
1224 Given a text, returns a list of the sentences in that text.
1225 """
-> 1226 return list(self.sentences_from_text(text, realign_boundaries))
1227
1228 def debug_decisions(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
1272 follows the period.
1273 """
-> 1274 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1275
1276 def _slices_from_text(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
1302 """
1303 realign = 0
-> 1304 for sl1, sl2 in _pair_iter(slices):
1305 sl1 = slice(sl1.start + realign, sl1.stop)
1306 if not sl2:
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(it)
308 """
309 it = iter(it)
--> 310 prev = next(it)
311 for el in it:
312 yield (prev, el)
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
1276 def _slices_from_text(self, text):
1277 last_break = 0
-> 1278 for match in self._lang_vars.period_context_re().finditer(text):
1279 context = match.group() + match.group('after_tok')
1280 if self.text_contains_sentbreak(context):
TypeError: expected string or bytes-like object
Could anyone help me figure out where exactly I may have gone wrong with my syntax?
You're passing bs to the tokenize function when you should be passing bs.text
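For example (a sketch of the fix; word_tokenize expects a string, which the soup's .text attribute provides):
bs = BeautifulSoup(web.text, 'html.parser')
tokes = nltk.word_tokenize(bs.text)  # pass the extracted string, not the BeautifulSoup object
tags = nltk.pos_tag(tokes)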
