Featuretools documentation of specifying primitive options is wrong?

The documentation says:
Specifying for Individual Primitives
Options for individual primitives or groups of primitives are set by the primitive_options parameter of DFS. This parameter maps any desired options to specific primitives. In the case of conflicting options, options set at this level will override options set at the entire DFS run level, and the include options will always take priority over their ignore counterparts.
However, I see that this is not true and the ignore option actually takes precedence over the include counterpart.
Below is the set-up I will use to demonstrate the claimed behaviour. It is an entityset with one grandparent (gp), two parents (p1, p2), and one child (c) attached to one parent (p1):
import pandas as pd
import featuretools as ft
from featuretools import variable_types as vt

# # Creating Relational Dataset
# ## Grand Parent
df_gp = pd.DataFrame({'gp_ind': ['a', 'b'],
                      'gp_ncol1': [1, 2], 'gp_ncol2': [3, 4],
                      'gp_ccol1': ['x', 'y'], 'gp_ccol2': ['p', 'q'],
                      'gp_time_col1': pd.to_datetime(['20-01-2020', '20-01-2019'], format="%d-%m-%Y"),
                      'gp_time_ind': pd.to_datetime(['20-01-2021', '20-01-2020'], format="%d-%m-%Y")})

# ## Parent 1
df_p1 = pd.DataFrame({'p1_ind': ['a1', 'a2', 'b1'],
                      'p1_id': ['a', 'a', 'b'],
                      'p1_ncol1': [1, 2, 3], 'p1_ncol2': [3, 4, 5],
                      'p1_ccol1': ['x', 'y', 'z'], 'p1_ccol2': ['p', 'q', 'r'],
                      'p1_id1': ['t', 't', 'u'],
                      'p1_time_col1': pd.to_datetime(['16-01-2020', '11-12-2019', '16-01-2019'], format="%d-%m-%Y"),
                      'p1_time_ind': pd.to_datetime(['15-01-2021', '10-12-2020', '15-01-2020'], format="%d-%m-%Y")})

# ## Parent 2
df_p2 = pd.DataFrame({'p2_ind': ['a1_', 'a2_', 'b1_'],
                      'p2_id': ['a', 'a', 'b'],
                      'p2_ncol1': [1, 2, 3], 'p2_ncol2': [3, 4, 5],
                      'p2_ccol1': ['x', 'y', 'z'], 'p2_ccol2': ['p', 'q', 'r'],
                      'p2_time_col1': pd.to_datetime(['18-01-2020', '13-12-2019', '18-01-2019'], format="%d-%m-%Y"),
                      'p2_time_ind': pd.to_datetime(['17-01-2021', '12-12-2020', '17-01-2020'], format="%d-%m-%Y")})

# ## Child
df_c = pd.DataFrame({'c_ind': ['a1_1', 'a1_2', 'a2_1', 'a2_2', 'a2_3', 'b1_1'],
                     'c_id': ['a1', 'a1', 'a2', 'a2', 'a2', 'b1'],
                     'c_ncol1': [1, 2, 3, 4, 5, 6], 'c_ncol2': [3, 4, 5, 6, 7, 8],
                     'c_ccol1': ['x', 'y', 'z', 'a', 'b', 'c'], 'c_ccol2': ['p', 'q', 'r', 's', 't', 'u'],
                     'c_time_col1': pd.to_datetime(['13-01-2020', '10-12-2019', '8-12-2019', '5-11-2019', '2-10-2019', '13-01-2019'], format="%d-%m-%Y"),
                     'c_time_ind': pd.to_datetime(['10-01-2021', '5-12-2020', '9-12-2020', '6-11-2020', '3-10-2019', '12-01-2020'], format="%d-%m-%Y")})
# # Creating Entityset
es = ft.EntitySet(id='experimentation')

# ## Adding entities
# ### Adding gp
vt_gp = {'gp_ind': vt.Index,
         'gp_ncol1': vt.Numeric,
         'gp_ncol2': vt.Numeric,
         'gp_ccol1': vt.Categorical,
         'gp_ccol2': vt.Categorical,
         'gp_time_col1': vt.Datetime,
         'gp_time_ind': vt.DatetimeTimeIndex}
es.entity_from_dataframe(entity_id='gp', dataframe=df_gp, index='gp_ind',
                         variable_types=vt_gp, time_index='gp_time_ind')

# ### Adding p1
vt_p1 = {'p1_ind': vt.Index,
         'p1_id': vt.Id,
         'p1_id1': vt.Id,
         'p1_ncol1': vt.Numeric,
         'p1_ncol2': vt.Numeric,
         'p1_ccol1': vt.Categorical,
         'p1_ccol2': vt.Categorical,
         'p1_time_col1': vt.Datetime,
         'p1_time_ind': vt.DatetimeTimeIndex}
es.entity_from_dataframe(entity_id='p1', dataframe=df_p1, index='p1_ind',
                         variable_types=vt_p1, time_index='p1_time_ind')

# ### Adding p2
vt_p2 = {'p2_ind': vt.Index,
         'p2_id': vt.Id,
         'p2_ncol1': vt.Numeric,
         'p2_ncol2': vt.Numeric,
         'p2_ccol1': vt.Categorical,
         'p2_ccol2': vt.Categorical,
         'p2_time_col1': vt.Datetime,
         'p2_time_ind': vt.DatetimeTimeIndex}
es.entity_from_dataframe(entity_id='p2', dataframe=df_p2, index='p2_ind',
                         variable_types=vt_p2, time_index='p2_time_ind')

# ### Adding c
vt_c = {'c_ind': vt.Index,
        'c_id': vt.Id,
        'c_ncol1': vt.Numeric,
        'c_ncol2': vt.Numeric,
        'c_ccol1': vt.Categorical,
        'c_ccol2': vt.Categorical,
        'c_time_col1': vt.Datetime,
        'c_time_ind': vt.DatetimeTimeIndex}
es.entity_from_dataframe(entity_id='c', dataframe=df_c, index='c_ind',
                         variable_types=vt_c, time_index='c_time_ind')

# ## Adding Relationships
r_gp_p1 = ft.Relationship(es['gp']['gp_ind'], es['p1']['p1_id'])
r_gp_p2 = ft.Relationship(es['gp']['gp_ind'], es['p2']['p2_id'])
r_p1_c = ft.Relationship(es['p1']['p1_ind'], es['c']['c_id'])
es.add_relationships([r_gp_p1, r_gp_p2, r_p1_c])

# ## Create cutoff times
cutoff_times = df_gp.loc[:, ['gp_ind', 'gp_time_ind']].copy(deep=True)

# ## Add interesting values
es['p1']['p1_ccol1'].interesting_values = es['p1'].df['p1_ccol1'].unique()[0:1]
es['c']['c_ccol1'].interesting_values = es['c'].df['c_ccol1'].unique()[0:1]

# ## Add last time index
es.add_last_time_indexes()

# ## Plotting entityset
es.plot()
Now I run the following DFS call on this entityset.
I include p1 in both the ignore_entities and include_entities keys. This way, I convey conflicting instructions to DFS about whether or not to include the p1 entity in the feature-creation process.
Expected behaviour: include_entities overrides ignore_entities, and features on entity p1 are made.
Observed behaviour: ignore_entities overrides include_entities, and features on p1 are not made.
agg_primitives = ['sum']
where_primitives = ['sum']
primitive_options = {}
primitive_options[('sum',)] = {}
primitive_options[('sum',)]['ignore_entities'] = ['p1']
primitive_options[('sum',)]['include_entities'] = ['p1']
features = ft.dfs(entityset=es, target_entity='gp', cutoff_time=cutoff_times,
                  agg_primitives=agg_primitives, features_only=True, max_depth=2,
                  where_primitives=where_primitives,
                  primitive_options=primitive_options, trans_primitives=[])
features
features
output:
[<Feature: gp_ncol1>,
<Feature: gp_ncol2>,
<Feature: gp_ccol1>,
<Feature: gp_ccol2>]
No feature is made on p1, which goes against what is stated in the documentation.
Am I missing something here, or is the documentation actually wrong as I suspect, meaning I should understand that ignore_entities overrides include_entities?

This was a bug; you can track the proposed fix here: https://github.com/alteryx/featuretools/pull/1518
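Until the linked fix is released, a simple workaround is to not pass conflicting keys for the same primitive at all, and to specify only the option you actually want. A minimal sketch against the same entityset as above, assuming you want p1 included:

# Workaround sketch: specify only include_entities for the primitive, so there
# is no include/ignore conflict for DFS to resolve.
primitive_options = {('sum',): {'include_entities': ['p1']}}

features = ft.dfs(entityset=es, target_entity='gp', cutoff_time=cutoff_times,
                  agg_primitives=['sum'], where_primitives=['sum'],
                  trans_primitives=[], max_depth=2, features_only=True,
                  primitive_options=primitive_options)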

Related

Python serialization using pb3 (proto3)

I am using pb3 for serialization:
syntax = "proto3";
package marshalling;
import "google/protobuf/timestamp.proto";
message PrimitiveType {
oneof primitive_value {
bool boolean_value = 1;
int64 int_value = 2;
double double_value = 3;
google.protobuf.Timestamp timestamp_value = 4;
}
}
I generated an x_pb2.py file but don't know how to use it.
For example, if I would like to Marshall a timestamp to bytes, how could I do it?
With reference to The Protocol Buffer API section:
Unlike when you generate Java and C++ protocol buffer code, the Python protocol buffer compiler doesn't generate your data access code for you directly. Instead, it generates special descriptors for all your messages, enums, and fields, and some mysteriously empty classes, one for each message type...
and,
At load time, the GeneratedProtocolMessageType metaclass uses the specified descriptors to create all the Python methods you need to work with each message type and adds them to the relevant classes. You can then use the fully-populated classes in your code.
So, you can use the generated class(es) to create the object(s) and set their fields like this:
p1 = primitive_types_pb2.PrimitiveType()
p1.int_value = 1234
For your use-case, you can use timestamp_pb2.Timestamp.GetCurrentTime().
Alternatively, you can refer to Timestamp along with timestamp_pb2.Timestamp.CopyFrom():
import time

from google.protobuf.timestamp_pb2 import Timestamp

now = time.time()
seconds = int(now)
nanos = int((now - seconds) * 10**9)
timestamp = Timestamp(seconds=seconds, nanos=nanos)
p1 = primitive_types_pb2.PrimitiveType()
p1.timestamp_value.CopyFrom(timestamp)
There are other google.protobuf.timestamp_pb2 APIs that you might be interested in for your other use-cases.
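For example, here's a short sketch of round-tripping between a Timestamp and a Python datetime using the well-known-type helpers FromDatetime() and ToDatetime(), both part of the standard google.protobuf Timestamp class:

from datetime import datetime

from google.protobuf.timestamp_pb2 import Timestamp

ts = Timestamp()
ts.FromDatetime(datetime.utcnow())  # datetime -> Timestamp
dt = ts.ToDatetime()                # Timestamp -> naive UTC datetime
print(ts, dt)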
Here's a complete working example (with the .proto above saved as primitive_types.proto and compiled):
import time # For Timestamp.CopyFrom(). See commented code below
import primitive_types_pb2
from google.protobuf import timestamp_pb2
# serialization
p1 = primitive_types_pb2.PrimitiveType()
# Alternative to GetCurrentTime()
# now = time.time()
# seconds = int( now )
# nanos = int( (now - seconds) * 10**9 )
# timestamp = timestamp_pb2.Timestamp( seconds=seconds, nanos=nanos )
# p1.timestamp_value.CopyFrom( timestamp )
p1.timestamp_value.GetCurrentTime()
serialized = p1.SerializeToString()
# deserialization
p2 = primitive_types_pb2.PrimitiveType()
p2.ParseFromString( serialized )
print( p2.timestamp_value )
Output:
seconds: 1590581054
nanos: 648958000
References:
https://developers.google.com/protocol-buffers/docs/proto3#oneof
https://developers.google.com/protocol-buffers/docs/pythontutorial
https://developers.google.com/protocol-buffers/docs/reference/google.protobuf#timestamp
https://googleapis.dev/python/protobuf/latest/google/protobuf/timestamp_pb2.html

ITM (Irish Transverse Mercator) conversion to GPS for Google Maps in Python 3

I don't know anything about coordinates. My problem is that I have a dataset that contains coordinates in the ITM format (Irish_X and Irish_Y). I want to convert ITM coordinates into Google Maps readable ones.
Online I found a library that may be useful, but I don't know how to use it, and the documentation uses very specific jargon I'm not used to:
https://proj.org
I also commented in the same library's GitHub repo looking for an answer:
https://github.com/OSGeo/PROJ/issues/1687
Thank you very much for your help!
To convert from ITM to WGS84, just call the itm2geo(x, y) function, as exemplified at the end of the code in the Test values section.
The first two functions (arcmer and xy2geo) are auxiliary functions and don't need to be called explicitly (they are called by itm2geo(x, y)).
from math import *

############################################################################
# Meridian Arc
############################################################################
def arcmer(a, equad, lat1, lat2):
    b = a * sqrt(1 - equad)
    n = (a - b) / (a + b)
    a0 = 1. + ((n**2) / 4.) + ((n**4) / 64.)
    a2 = (3. / 2.) * (n - ((n**3) / 8.))
    a4 = (15. / 16.) * ((n**2) - ((n**4) / 4.))
    a6 = (35. / 48.) * (n**3)
    s1 = a / (1 + n) * (a0 * lat1 - a2 * sin(2. * lat1) + a4 * sin(4. * lat1) - a6 * sin(6. * lat1))
    s2 = a / (1 + n) * (a0 * lat2 - a2 * sin(2. * lat2) + a4 * sin(4. * lat2) - a6 * sin(6. * lat2))
    return s2 - s1

###############################################################################
# Transverse Mercator Inverse Projection
###############################################################################
def xy2geo(m, p, a, equad, lat0, lon0):
    lat0 = radians(lat0)
    lon0 = radians(lon0)
    sigma1 = p
    fil = lat0 + sigma1 / (a * (1 - equad))
    deltafi = 1
    while deltafi > 0.0000000001:
        sigma2 = arcmer(a, equad, lat0, fil)
        RO = a * (1 - equad) / ((1 - equad * (sin(fil)**2))**(3. / 2.))
        deltafi = (sigma1 - sigma2) / RO
        fil = fil + deltafi
    N = a / sqrt(1 - equad * (sin(fil))**2)
    RO = a * (1 - equad) / ((1 - equad * (sin(fil)**2))**(3. / 2.))
    t = tan(fil)
    psi = N / RO
    lat = fil-(t/RO)*((m**2)/(2.*N))+(t/RO)*((m**4)/(24.*(N**3)))*(-4.*(psi**2)-9.*psi*(1.-t**2)+12.*(t**2))-(t/RO)*(m**6/(720.*(N**5)))*(8.*(psi**4)*(11.-24.*(t**2))-12.*(psi**3)*(21.-71.*(t**2))+15.*(psi**2)*(15.-98.*(t**2)+15.*(t**4))+180.*psi*(5.*(t**2)-3.*(t**4))-360.*(t**4))+(t/RO)*((m**8)/(40320.*(N**7)))*(1385.+3633.*(t**2)+4095.*(t**4)+1575.*(t**6))
    lon = (m/(N))-((m**3)/(6.*(N**3)))*(psi+2.*(t**2))+((m**5)/(120.*(N**5)))*(-4.*(psi**3)*(1.-6.*(t**2))+(psi**2)*(9.-68.*(t**2))+72.*psi*(t**2)+24.*(t**4))-((m**7)/(5040.*(N**7)))*(61.+662.*(t**2)+1320.*(t**4)+720.*(t**6))
    lon = lon0 + lon / cos(fil)
    lat = degrees(lat)
    lon = degrees(lon)
    return lat, lon

#############################################################################
# Irish Transverse Mercator - Inverse
#############################################################################
def itm2geo(x, y):
    # GRS-80 ellipsoid
    a = 6378137.
    equad = 0.00669437999
    # Natural origin
    lat0 = 53.5
    lon0 = -8.
    k0 = 0.999820
    p = (y - 750000.) / k0
    m = (x - 600000.) / k0
    lat, lon = xy2geo(m, p, a, equad, lat0, lon0)
    return lat, lon

#############################################################################
# Test values
#############################################################################
# lat = 53.5
# lon = -8.
test = itm2geo(600000., 750000.)
print("latitude= %.16f" % test[0])
print("longitude= %.16f" % test[1])

Parameter aliasing

When implementing Origen::Parameters, I understood the importance of defining a 'default' set. But, in essence, my real default is named something different, so I implemented a hacky parameter alias:
Origen.top_level.define_params :default do |params|
  params.tconds.override = 1
  params.tconds.override_lev_equ_set = 1
  params.tconds.override_lev_spec_set = 1
  params.tconds.override_levset = 1
  params.tconds.override_seqlbl = 'my_pattern'
  params.tconds.override_testf = 'tm_3'
  params.tconds.override_tim_spec_set = 'bist_xxMhz'
  params.tconds.override_timset = '1,1,1,1,1,1,1,1'
  params.tconds.site_control = 'parallel:'
  params.tconds.site_match = 2
end

Origen.top_level.define_params :cpu_mbist_hr, inherit: :default do |params|
  # way of aliasing parameter names
end
Is there a proper method of parameter aliasing that is just not documented?
There is no other way to do this currently, though I would be open to a PR to enable something like:
default_params = :cpu_mbist_hr
If you don't want them to be called :default in this case though, then maybe you don't really want them to be the default anyway.
e.g. adding this immediately after you define them would effectively give you an alternative default and would do pretty much the same job as the proposed API above:
# self is required here to help Ruby know that you are calling the params= API
# and not defining a local variable called params
self.params = :cpu_mbist_hr

QTreeView crashing for no apparent reason

I introduced a treeview in the GUI of the program I'm making, and since then it crashes when I attempt to change its model once it has been set.
The course of action is:
1. Load the file using a file dialogue.
2. Clear the models on the interface objects (tables and treeview). The first time, the treeview is not affected since there is no model in it.
3. Populate the treeview model.
4. Other stuff not related to the issue.
The problematic functions are:
The file loading procedure:
def open_file(self):
    """
    Open a file
    :return:
    """
    print("actionOpen_file_click")
    # declare the dialog
    # file_dialog = QtGui.QFileDialog(self)
    # declare the allowed file types
    files_types = "Excel 97 (*.xls);;Excel (*.xlsx);;DigSILENT (*.dgs);;MATPOWER (*.m)"
    # call dialog to select the file
    filename, type_selected = QtGui.QFileDialog.getOpenFileNameAndFilter(self, 'Open file',
                                                                         self.project_directory, files_types)
    if len(filename) > 0:
        self.project_directory = os.path.dirname(filename)
        print(filename)
        self.circuit = Circuit(filename, True)
        # set data structures list model
        self.ui.dataStructuresListView.setModel(self.available_data_structures_listModel)
        # set the first index
        index = self.available_data_structures_listModel.index(0, 0, QtCore.QModelIndex())
        self.ui.dataStructuresListView.setCurrentIndex(index)
        # clean
        self.clean_GUI()
        # load table
        self.display_objects_table()
        # draw graph
        self.ui.gridPlot.setTitle(os.path.basename(filename))
        self.re_plot()
        # show times
        if self.circuit.time_series is not None:
            if self.circuit.time_series.is_ready():
                self.set_time_comboboxes()
        # tree view at the results
        self.set_results_treeview_structure()
        # populate editors
        self.populate_editors_defaults()
The treeview model assignment:
def set_results_treeview_structure(self):
    """
    Sets the results treeview data structure
    :return:
    """
    # self.ui.results_treeView.setSelectionBehavior(QtGui.QAbstractItemView.SelectRows)
    model = QtGui.QStandardItemModel()
    # model.setHorizontalHeaderLabels(['Elements'])
    self.ui.results_treeView.setModel(model)
    # self.ui.results_treeView.setUniformRowHeights(True)

    def pass_to_QStandardItem_list(list_):
        res = list()
        for elm in list_:
            elm1 = QtGui.QStandardItem(elm)
            elm1.setEditable(False)
            res.append(elm1)
        return res

    bus_results = pass_to_QStandardItem_list(['Voltages (p.u.)', 'Voltages (kV)'])
    per_bus_results = pass_to_QStandardItem_list(['Voltage (p.u.) series', 'Voltage (kV) series',
                                                  'Active power (MW)', 'Reactive power (MVar)',
                                                  'Active and reactive power (MW, MVar)', 'Aparent power (MVA)',
                                                  'S-V curve', 'Q-V curve'])
    branches_results = pass_to_QStandardItem_list(['Loading (%)', 'Current (p.u.)',
                                                   'Current (kA)', 'Losses (MVA)'])
    per_branch_results = pass_to_QStandardItem_list(['Loading (%) series', 'Current (p.u.) series',
                                                     'Current (kA) series', 'Losses (MVA) series'])
    generator_results = pass_to_QStandardItem_list(['Reactive power (p.u.)', 'Reactive power (MVar)'])
    per_generator_results = pass_to_QStandardItem_list(['Reactive power (p.u.) series',
                                                        'Reactive power (MVar) series'])

    self.family_results_per_family = dict()

    # nodes
    buses = QtGui.QStandardItem('Buses')
    buses.setEditable(False)
    buses.appendRows(bus_results)
    self.family_results_per_family[0] = len(bus_results)
    names = self.circuit.bus_names
    for name in names:
        bus = QtGui.QStandardItem(name)
        bus.appendRows(per_bus_results)
        bus.setEditable(False)
        buses.appendRow(bus)

    # branches
    branches = QtGui.QStandardItem('Branches')
    branches.setEditable(False)
    branches.appendRows(branches_results)
    self.family_results_per_family[1] = len(branches_results)
    names = self.circuit.branch_names
    for name in names:
        branch = QtGui.QStandardItem(name)
        branch.appendRows(per_branch_results)
        branch.setEditable(False)
        branches.appendRow(branch)

    # generators
    generators = QtGui.QStandardItem('Generators')
    generators.setEditable(False)
    generators.appendRows(generator_results)
    self.family_results_per_family[2] = len(generator_results)
    names = self.circuit.gen_names
    for name in names:
        gen = QtGui.QStandardItem(name)
        gen.appendRows(per_generator_results)
        gen.setEditable(False)
        generators.appendRow(gen)

    model.appendRow(buses)
    model.appendRow(branches)
    model.appendRow(generators)
And the GUI "cleaning":
def clean_GUI(self):
    """
    Initializes the comboboxes and tables
    :return:
    """
    self.ui.tableView.setModel(None)
    if self.ui.results_treeView.model() is not None:
        self.ui.results_treeView.model().clear()
    self.ui.profile_time_selection_comboBox.clear()
    self.ui.results_time_selection_comboBox.clear()
    self.ui.gridPlot.clear()
The complete code can be seen here.
I have seen that this behaviour is usually triggered by calls outside the GUI thread, but I don't think this is the case here.
I'd appreciate it if someone could point out the problem. Again, the complete code for testing is here.
The solution to this in my case has been the following:
1. The QStandardItemModel() variable called model in the code was turned into an instance attribute, self.tree_model.
2. When I want to replace the treeview's model, I delete the old one with del self.tree_model.
3. Then I re-create it with self.tree_model = QStandardItemModel().
This way the TreeView's model is effectively replaced without crashing.
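For reference, a minimal sketch of that pattern (names follow the code above; the PyQt4-style QtGui namespace is assumed). The likely root cause is that a Qt view's setModel() does not take ownership of a Python-created model, so a model held only in a local variable can be garbage-collected while the view still references it; keeping the reference on self prevents that:

def set_results_treeview_structure(self):
    # Keep the model alive as an instance attribute: the view does not own
    # the Python-side model object, so a purely local model can be
    # garbage-collected while the view still points at it.
    if hasattr(self, 'tree_model'):
        del self.tree_model
    self.tree_model = QtGui.QStandardItemModel()
    self.ui.results_treeView.setModel(self.tree_model)
    # ... populate self.tree_model exactly as before ...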

Include monotonically increasing value in logstash field?

I know there's no built-in "line count" functionality while processing files through logstash (for various understandable and documented reasons). But there should be a mechanism, within any given logstash instance, to have a monotonically increasing variable / count for every parsed line.
I don't want to go the metrics route since it's a continuous polling mechanism (every n seconds). Alternatives include pre-processing of log files, which, given my particular use case, is unacceptable.
Again, let me reiterate: I need the ability to generate/read a monotonically increasing variable that I can store in a logstash filter.
Thoughts?
There's nothing built into logstash to do it.
You can build a filter to do it pretty easily.
Just drop something like this into lib/logstash/filters/seq.rb:
# encoding: utf-8
require "logstash/filters/base"
require "logstash/namespace"
require "set"

# This filter adds a sequence number to a log entry
#
# The config looks like this:
#
#     filter {
#       seq {
#         field => "seq"
#       }
#     }
#
# The `field` is the field you want added to the event.
class LogStash::Filters::Seq < LogStash::Filters::Base

  config_name "seq"
  milestone 1

  config :field, :validate => :string, :required => false, :default => "seq"

  public
  def register
    # Nothing
  end # def register

  public
  def initialize(config = {})
    super
    @threadsafe = false

    # This filter needs to keep state.
    @seq = 1
  end # def initialize

  public
  def filter(event)
    return unless filter?(event)
    event[@field] = @seq
    @seq = @seq + 1
    filter_matched(event)
  end # def filter
end # class LogStash::Filters::Seq
This will start at 1 every time Logstash is restarted, but for most situations, this would be ok. If you need something that is persistent across restarts, you need to do a bit more work to persist it somewhere
For anyone finding this in 2018+: logstash now has a ruby filter that makes this much simpler. Put the following in a file somewhere:
# encoding: utf-8
def register(params)
  @seq = 1
end

def filter(event)
  event.set("seq", @seq)
  @seq += 1
  return [event]
end
And then configure it like this in your logstash.conf (substitute in the filename you used):
ruby {
  path => "/usr/local/lib/logstash/seq.rb"
}
It would be pretty easy to make the field name configurable from logstash.conf, but I'll leave that as an exercise for the reader.
I suspect this isn't thread-safe, so I'm running only a single logstash worker.
This is another way to solve the problem; it works for me (thanks to the previous answer for the note about thread safety). I use the seq field to sort in descending order.
This is my configuration:
logstash.conf
filter {
  ruby {
    code => 'event.set("seq", Time.now.strftime("%N").to_i)'
  }
}
logstash.yml
pipeline.batch.size: 200
pipeline.batch.delay: 60
pipeline.workers: 1
pipeline.output.workers: 1
