When I use vtkPolyDataWriter to create a vtk legacy file, I obtain the new version of the data file (5.1) with connectivities and offsets. Is it possible to change that and get the 'legacy' old format?
It seems gmsh is not able to read vtk file with the new format version..
(I work with python 3.8 and vtk package version 9.0.1)
Nope, you cannot programmatically choose the version. The only way to write with an old format is to downgrade vtk to the desired version.
For those interested, I post the code I used to convert new to old format. It works for polydata and unstructured grid vtk meshes.
import math
import re
def convert_to_old_format(mesh_fname: str, save_fname: str, copy_lines: bool = False):
conv = Converter(mesh_fname, save_fname, copy_lines)
conv.read()
conv.replace()
conv.write()
def change_vtk_version(filename: str, v: float = 5.1):
with open(filename, "r") as f:
x = f.read()
x = re.sub(r"(# vtk DataFile Version) (.+)", f"\\1 {v}", x)
with open(filename, "w") as f:
f.write(x)
class Converter:
def __init__(self, inp, out=None, copy_lines=False):
if out is None:
out = inp
self.inp = inp
self.out = out
self.copy_lines = copy_lines # if line cells should be copied
self.original = None
self.lines = None
self.polys = None
self.cells = None
def read(self):
with open(self.inp, "r") as f:
self.original = f.read().split("\n")
lines_original = list(map(lambda x: x.strip().split(), self.original))
for i, l in enumerate(self.original):
if "LINES" in l:
self.lines = NewContent("LINES", lines_original, i)
elif "POLYGONS" in l:
self.polys = NewContent("POLYGONS", lines_original, i)
elif "CELLS" in l:
self.cells = NewContent("CELLS", lines_original, i)
def replace(self):
if self.polys is not None:
self.original = self.polys.replace(self.original)
if self.cells is not None:
self.original = self.cells.replace(self.original)
if self.lines is not None:
self.original = self.lines.replace(self.original, replace=self.copy_lines)
def write(self):
with open(self.out, "w") as f:
f.write("\n".join(self.original))
change_vtk_version(self.out, 4.2)
class NewContent:
def __init__(self, kw, content, ln):
self.kw = kw
self.ln = ln
self.name = content[ln][0]
self.no = int(content[ln][1])
self.nc = int(content[ln][2])
flat_list = [item for line in content[ln + 2 :] for item in line]
flat_list = list(filter("".__ne__, flat_list))
self.offsets = list(map(int, flat_list[0 : self.no]))
self.connectivity = list(
map(int, flat_list[self.no + 2 : self.no + 2 + self.nc])
)
#property
def remove(self):
return self.ln, self.ln + math.ceil(self.no / 9) + math.ceil(self.nc / 9) + 3
def replace(self, lines, replace=True):
nb_cells = self.no - 1
new_content = []
if replace:
new_content = [f"{self.kw} {nb_cells} {nb_cells + self.nc}"]
for i in range(nb_cells):
nb_points = self.offsets[i + 1] - self.offsets[i]
ids = self.connectivity[self.offsets[i] : self.offsets[i + 1]]
new_content.append(f"{nb_points} {' '.join(map(str, ids))}")
lines_to_keep = lines
a, b = self.remove
del lines_to_keep[a:b]
lines_to_keep[a:a] = new_content
lines_to_keep = list(filter("".__ne__, lines_to_keep))
return lines_to_keep
if __name__ == "__main__":
convert_to_old_format("mesh.vtk", "mesh_old_format.vtk")
# change_vtk_version("mesh.vtk", 8.6)
Related
Here I have my Python code code, I don't understand why I am getting the following error. Any guidance or help would be much appreciated.
UnboundLocalError: local variable 'top_performer' referenced before assignment
def create(X, y, **kwargs):
method = kwargs.get("method", None)
#method = kwargs.get("method", "Binary_operators")
#method = kwargs.get("method", "Binning")
#method = kwargs.pop("method", "Cluster")
#categorical_cols = [c for c, t in zip(X.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT, DATATYPE_CATEGORY_STRING]]
#numerical_cols = [c for c, t in zip(X.columns, X_column_types) if t == DATATYPE_NUMBER]
#categorical = X[categorical_cols]
#numerical = X[numerical_cols]
categorical = X.select_dtypes(include=[object])
numerical = X.select_dtypes(exclude=[object])
# feature selection using Genetic Algorithm
if method == "fs_GA":
print("fs_GA")
enc = OneHotEncoder()
enc.fit(categorical)
Data_cat=pd.DataFrame(enc.transform(categorical).toarray())
X_data = pd.concat([numerical, Data_cat], axis=1)
if y.dtype == int:
y = y
else:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_data, y, train_size=0.8, random_state=42)
def get_fitness(individual):
if y.dtype == int:
rg = RandomForestRegressor(random_state=42)
else:
rg = RandomForestClassifier(random_state=42)
columns = [column for (column, binary_value) in zip(X_train.columns, individual) if binary_value]
training_set = X_train[columns]
test_set = X_test[columns]
rg.fit(training_set.values, y_train)
preds = rg.predict(test_set.values)
return 100 / np.sqrt(mean_squared_error(y_test, preds))
individual = [1] * 100
get_fitness(individual)
def get_population_fitness(population):
return sorted([(individual, get_fitness(individual)) for individual in population], key=lambda tup: tup[1], reverse=True)
def crossover(individual_a, individual_b):
crossing_point = random.randint(0, 99)
offspring_a = individual_a[0:crossing_point] + individual_b[crossing_point:100]
offspring_b = individual_b[0:crossing_point] + individual_a[crossing_point:100]
return offspring_a, offspring_b
def tournament(current_population):
index = sorted(random.sample(range(0, 20), 5))
tournament_members = [current_population[i] for i in index]
total_fitness = sum([individual[1] for individual in tournament_members])
probabilities = [individual[1] / total_fitness for individual in tournament_members]
index_a, index_b = np.random.choice(5, size=2, p=probabilities)
return crossover(tournament_members[index_a][0], tournament_members[index_b][0])
def mutation(individual):
mutation_point = random.randint(0, 99)
if(individual[mutation_point]):
individual[mutation_point] = 0
else:
individual[mutation_point] = 1
def build_next_generation(current_population, mutation_rate):
next_generation = []
next_generation.append(current_population[0][0]) # elitism
next_generation.append(current_population[random.randint(1,19)][0]) # randomness
for i in range(9): # tournaments
offspring_a, offspring_b = tournament(current_population)
next_generation.append(offspring_a)
next_generation.append(offspring_b)
for individual in next_generation: # mutation
if(random.randint(1,mutation_rate) == 1):
mutation(individual)
return next_generation
def run_ga(current_population, num_of_generations, mutation_rate=1000):
fittest_individuals = []
for i in range(num_of_generations):
current_population = get_population_fitness(current_population) # get pop fitness
fittest_individuals.append(current_population[0]) # record fittest individual (for graphing and analysis)
current_population = build_next_generation(current_population, mutation_rate) # make new population
return fittest_individuals
initial_population = [[random.randint(0, 1) for i in range(100)] for i in range(20)]
high_mutation_fittest = run_ga(initial_population, 100, mutation_rate=5)
high_mutation_fitness = [ind[1] for ind in high_mutation_fittest]
for item in high_mutation_fittest[:-1]:
if item[1] == max(high_mutation_fitness):
top_performer = item
break
print("Total features included: " + str(top_performer[0].count(1)))
selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
excluded_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if not binary_value]
X = X[selected_features]
if method == "Binary_operators":
print("binaryoperators")
if method == "Binning":
print("binning")
else:
print("Discretization")
if method == "Cluster":
print("clustering")
else:
print("no-cluster")
print("normal_autocross")
So when I run the code I get the following error and I don't seem to understand what it means. Can someone please explain to me why i'm getting this error?
create(X, y, method="fs_GA")
fs_GA
UnboundLocalError Traceback (most recent call last)
in
----> 1 create(X, y, method="fs_GA")
in create(X, y, **kwargs)
107 top_performer = item
108 break
--> 109 print("Total features included: " + str(top_performer[0].count(1)))
110
111 selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
UnboundLocalError: local variable 'top_performer' referenced before assignment
top_performer = 0
for item in high_mutation_fittest[:-1]:
if item[1] == max(high_mutation_fitness):
top_performer = item
break
print("Total features included: " + str(top_performer[0].count(1)))
According to your code top_performer is an int variable, not an array, str(top_performer) is correct way of using it. str(top_performer).count('1') , this could be what you are looking for. count is for string not int
I have a dataset with Russian text, which looks like this:
I am trying to pre-process this dataset and split it to train,dev and testing datasets by using the following code:
# coding=utf-8
import os
import argparse
import xml.etree.ElementTree as ET
import random
import math
from collections import Counter
from utils import semeval2014term_to_aspectsentiment_hr
from copy import copy, deepcopy
parser = argparse.ArgumentParser(description='Generate finetuning corpus for restaurants.')
parser.add_argument('--noconfl',
action='store_true',
default=False,
help='Remove conflicting sentiments from labels')
parser.add_argument('--istrain',
action='store_true',
default=False,
help='If is a training set we split of 10% and output train_full, train_split, dev. Default is testset creating no split')
parser.add_argument("--files",
type=str,
nargs='+',
action="store",
help="File that contains the data used for training. Multiple paths will mix the datasets.")
parser.add_argument("--output_dir",
type=str,
action="store",
default="data/transformed/untitled",
help="output dir of the dataset(s)")
parser.add_argument("--upsample",
type=str,
action="store",
default=None,
help="please add a string with 3 numbers like '0.5 0.3 0.2' representing relative numbers of 'POS NEG NEU' adding to 1"
" which represents target distribution - only valid in non-confl case")
parser.add_argument("--seed",
type=int,
action="store",
default=41,
help="random seed, effects on upsampling and validationset")
args = parser.parse_args()
# 1. Load The Dataset
# 2. Create Bert-Pair Style Format
# 3. Save Train, Validation and so on
def split_shuffle_array(ratio, array, rseed):
# split_ratio_restaurant = .076 # for 150 sentence in conflicting case
# split_ratio_laptops = .101 # for 150 sentences in conflicting case
random.Random(rseed).shuffle(array)
m = math.floor(ratio * len(array))
return array[0:m], array[m::]
def create_sentence_pairs(sents, aspect_term_sentiments):
# create sentence_pairs
all_sentiments = []
sentence_pairs = []
labels = []
for ix, ats in enumerate(aspect_term_sentiments):
s = sents[ix]
for k, v in ats:
all_sentiments.append(v)
sentence_pairs.append((s, k))
labels.append(v)
counts = Counter(all_sentiments)
return sentence_pairs, labels, counts
def upsample_data(sentence_pairs, labels, target_ratios={'POS': 0.53, 'NEG': 0.21, 'NEU': 0.26}):
# one question: should we upsample sentencepairs, where the sentence only occurs once?!
print('Upsampling data ...')
# print(sentence_pairs, labels) # is list of pairs -> decide which pair to upsample ...
# 0. compute indeex subsets for every example
# 1. compute how many samples to sample ->
ix_subsets = {
'POS': [],
'NEG': [],
'NEU': []
}
ratios_subsets = {
'POS': 0,
'NEG': 0,
'NEU': 0
}
examples_to_add = {
'POS': 0,
'NEG': 0,
'NEU': 0
}
n = float(len(labels))
for ix, l in enumerate(labels):
ix_subsets[l].append(ix)
ratios_subsets[l] += (1.0 / n)
t_keys = target_ratios.keys()
tmp = [math.floor(target_ratios[k] * n) - len(ix_subsets[k]) for k in t_keys]
class_nothing_to_add = list(t_keys)[tmp.index(min(tmp))]
print(t_keys)
print(ratios_subsets)
print(tmp)
print(class_nothing_to_add)
# print(ix_subsets)
m = len(ix_subsets[class_nothing_to_add]) / target_ratios[class_nothing_to_add]
total_to_add = m - n
print(n, math.floor(m))
examples_to_add = {k: math.floor(target_ratios[k] * m - len(ix_subsets[k])) for k in t_keys}
print(examples_to_add) # so we need to add more neutral examples and more positiev ones
# downsampling would be set 0 the maximum amount of negative ones
# now select all the indices, with replacement because it can be more than double
new_samples = []
for k in t_keys:
new_samples.extend(random.Random(args.seed).choices(ix_subsets[k], k=examples_to_add[k]))
print(len(new_samples))
# now add all new samples to the dataset and shuffle it
new_sentence_pairs = copy(sentence_pairs)
new_labels = labels.copy()
for ix in new_samples:
new_sentence_pairs.append(copy(sentence_pairs[ix]))
new_labels.append(labels[ix])
random.Random(args.seed).shuffle(new_sentence_pairs)
random.Random(args.seed).shuffle(new_labels)
print(len(set(new_sentence_pairs)))
print(len(set(sentence_pairs)))
return new_sentence_pairs, new_labels
def export_dataset_to_xml(fn, sentence_pairs, labels):
# export in format semeval 2014, incomplete though! just for loading with existing dataloaders for ATSC
sentences_el = ET.Element('sentences')
sentimap_reverse = {
'POS': 'positive',
'NEU': 'neutral',
'NEG': 'negative',
'CONF': 'conflict'
}
for ix, (sentence, aspectterm) in enumerate(sentence_pairs):
# print(sentence)
sentiment = labels[ix]
sentence_el = ET.SubElement(sentences_el, 'sentence')
sentence_el.set('id', str(ix))
text = ET.SubElement(sentence_el, 'text')
text.text = str(sentence).strip()
aspect_terms_el = ET.SubElement(sentence_el, 'aspectTerms')
aspect_term_el = ET.SubElement(aspect_terms_el, 'aspectTerm')
aspect_term_el.set('term', aspectterm)
aspect_term_el.set('polarity', sentimap_reverse[sentiment])
aspect_term_el.set('from', str('0'))
aspect_term_el.set('to', str('0'))
def indent(elem, level=0):
i = "\n" + level * " "
j = "\n" + (level - 1) * " "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for subelem in elem:
indent(subelem, level + 1)
if not elem.tail or not elem.tail.strip():
elem.tail = j
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = j
return elem
indent(sentences_el)
# mydata = ET.dump(sentences_el)
mydata = ET.tostring(sentences_el)
with open(fn, "wb") as f:
# f.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
f.write(mydata)
f.close()
def save_dataset_to_tsv(fn, data):
pass
sentence_pairs_train_mixed = []
sentence_pairs_trainsplit_mixed = []
sentence_pairs_dev_mixed = []
sentence_pairs_test_mixed = []
labels_train_mixed = []
labels_trainsplit_mixed = []
labels_dev_mixed = []
labels_test_mixed = []
for fn in args.files:
print(args.output_dir)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
print(fn)
sents_train, ats_train, idx2labels = semeval2014term_to_aspectsentiment_hr(fn,
remove_conflicting=args.noconfl)
sentence_pairs_train, labels_train, counts_train = create_sentence_pairs(sents_train, ats_train)
if args.istrain:
sents_dev, sents_trainsplit = split_shuffle_array(.1, sents_train, 41)
ats_dev, ats_trainsplit = split_shuffle_array(.1, ats_train, 41)
sentence_pairs_dev, labels_dev, counts_dev = create_sentence_pairs(sents_dev, ats_dev)
sentence_pairs_trainsplit, labels_trainsplit, counts_trainsplit = create_sentence_pairs(sents_trainsplit,
ats_trainsplit)
print_dataset_stats('Train', sents_train, sentence_pairs_train, counts_train)
print_dataset_stats('Dev', sents_dev, sentence_pairs_dev, counts_dev)
print_dataset_stats('TrainSplit', sents_trainsplit, sentence_pairs_trainsplit, counts_trainsplit)
sentence_pairs_trainsplit_mixed += sentence_pairs_trainsplit
sentence_pairs_train_mixed += sentence_pairs_train
sentence_pairs_dev_mixed += sentence_pairs_dev
labels_trainsplit_mixed += labels_trainsplit
labels_train_mixed += labels_train
labels_dev_mixed += labels_dev
if len(args.files) == 1:
if args.upsample:
distro_arr = args.upsample.split(' ')
pos = float(distro_arr[0])
neg = float(distro_arr[1])
neu = float(distro_arr[2])
assert pos + neg + neu == 1.0, 'upsampling target distribution does not sum to 1'
target_distro = {'POS': pos, 'NEG': neg, 'NEU': neu}
print('Target Sampling Distribution for Training Set:', target_distro)
sentence_pairs_train, labels_train = upsample_data(sentence_pairs_train, labels_train, target_ratios=target_distro)
export_dataset_to_xml(args.output_dir + '/train.xml', sentence_pairs_train, labels_train)
export_dataset_to_xml(args.output_dir + '/dev.xml', sentence_pairs_dev, labels_dev)
export_dataset_to_xml(args.output_dir + '/train_split.xml', sentence_pairs_trainsplit, labels_trainsplit)
else:
sentence_pairs_test_mixed += sentence_pairs_train
labels_test_mixed += labels_train
print_dataset_stats('Test', sents_train, sentence_pairs_train, counts_train)
if len(args.files) == 1:
export_dataset_to_xml(args.output_dir + '/test.xml', sentence_pairs_train, labels_train)
if len(args.files) > 1:
if args.istrain:
export_dataset_to_xml(args.output_dir + '/train.xml', sentence_pairs_train_mixed, labels_train_mixed)
export_dataset_to_xml(args.output_dir + '/dev.xml', sentence_pairs_dev_mixed, labels_dev_mixed)
export_dataset_to_xml(args.output_dir + '/train_split.xml', sentence_pairs_trainsplit_mixed,
labels_trainsplit_mixed)
else:
export_dataset_to_xml(args.output_dir + '/test.xml', sentence_pairs_test_mixed, labels_test_mixed)
After running the code above I have this result:
For English text it works just fine. Could someone help me to fix this and get normal text?
ET.tostring(sentences_el, encoding='UTF-8')
I'm learning Python from this lecture: Lec 19 | MIT 6.00 Introduction to Computer Science and Programming. I'm using Python 3.6.2, lecture example runs on Python 2.x. Whats the proper way to set values of x and y in function ans_quest?
x, y = loc_list[-1].get_coords()
Can this method be called like this? This was the example in the lecture.
Full code:
import math, random, pylab, copy
class Location(object):
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def move(self, xc, yc):
return Location(self.x+float(xc), self.y+float(yc))
def get_coords(self):
return self.x, self.y
def get_dist(self, other):
ox, oy = other.get_coords()
x_dist = self.x - ox
y_dist = self.y - oy
return math.sqrt(x_dist**2 + y_dist**2)
class Compass_Pt(object):
possibles = ('N', 'S', 'E', 'W')
def __init__(self, pt):
if pt in self.possibles: self.pt = pt
else: raise ValueError('in Compass_Pt.__init__')
def move(self, dist):
if self.pt == 'N': return (0, dist)
elif self.pt == 'S': return (0, -dist)
elif self.pt == 'E': return (dist, 0)
elif self.pt == 'W': return (-dist, 0)
else: raise ValueError('in Compass_Pt.move')
class Field(object):
''' Cartesian plane where object will be located '''
def __init__(self, drunk, loc):
self.drunk = drunk
self.loc = loc
def move(self, cp, dist):
old_loc = self.loc
xc, yc = cp.move(dist)
self.loc = old_loc.move(xc, yc)
def get_loc(self):
return self.loc
def get_drunk(self):
return self.drunk
class Drunk(object):
''' Point itself '''
def __init__(self, name):
self.name = name
def move(self, field, cp, dist = 1):
if field.get_drunk().name != self.name:
raise ValueError('Drunk.move called with drunk not in the field')
for i in range(dist):
field.move(cp, 1)
class Usual_Drunk(Drunk):
def move(self, field, dist = 1):
''' Drunk.move superclass method override. Sends additional cp attribute.'''
cp = random.choice(Compass_Pt.possibles)
Drunk.move(self, field, Compass_Pt(cp), dist)
class Cold_Drunk(Drunk):
def move(self, field, dist = 1):
cp = random.choice(Compass_Pt.possibles)
if cp == 'S':
Drunk.move(self, field, Compass_Pt(cp), 2*dist)
else:
Drunk.move(self, field, Compass_Pt(cp), dist)
class EW_Drunk(Drunk):
def move(self, field, time = 1):
cp = random.choice(Compass_Pt.possibles)
while cp != 'E' and cp != 'W':
cp = random.choice(Compass_Pt.possibles)
Drunk.move(self, field, Compass_Pt(cp), time)
def perform_trial(time, f):
start = f.get_loc()
distances = [0,0]
for t in range(1, time + 1):
f.get_drunk().move(f)
new_loc = f.get_loc()
distance = new_loc.get_dist(start)
distances.append(distance)
return distances
def perform_sim(time, num_trials, drunk_type):
dist_lists = []
loc_lists = []
for trial in range(num_trials):
d = drunk_type('Drunk' + str(trial))
f = Field(d, Location(0, 0))
distances = perform_trial(time, f)
locs = copy.deepcopy(distances)
dist_lists.append(distances)
loc_lists.append(locs)
return dist_lists, loc_lists
def ans_quest(max_time, num_trials, drunk_type, title):
dist_lists, loc_lists = perform_sim(max_time, num_trials, drunk_type)
means = []
for t in range(max_time + 1):
tot = 0.0
for dist_l in dist_lists:
tot += dist_l[t]
means.append(tot/len(dist_lists))
pylab.figure()
pylab.plot(means)
pylab.ylabel('distance')
pylab.xlabel('time')
pylab.title('{} Ave. Distance'.format(title))
lastX = []
lastY = []
for loc_list in loc_lists:
x, y = loc_list[-1].get_coords()
lastX.append(x)
lastY.append(y)
pylab.figure()
pylab.scatter(lastX, lastY)
pylab.ylabel('NW Distance')
pylab.title('{} Final location'.format(title))
pylab.figure()
pylab.hist(lastX)
pylab.xlabel('EW Value')
pylab.ylabel('Number of Trials')
pylab.title('{} Distribution of Final EW Values'.format(title))
num_steps = 50
num_trials = 10
ans_quest(num_steps, num_trials, Usual_Drunk, 'Usual Drunk ' + str(num_trials) + ' Trials')
ans_quest(num_steps, num_trials, Cold_Drunk, 'Cold Drunk ' + str(num_trials) + ' Trials')
ans_quest(num_steps, num_trials, EW_Drunk, 'EW Drunk ' + str(num_trials) + ' Trials')
pylab.show()
Error:
Traceback (most recent call last):
File "/home/tihe/Documents/CODING/Project Home/Python/biased_random_walks.py", line 194, in <module>
ans_quest(num_steps, num_trials, Usual_Drunk, 'Usual Drunk ' + str(num_trials) + ' Trials')
File "/home/tihe/Documents/CODING/Project Home/Python/biased_random_walks.py", line 175, in ans_quest
x, y = loc_list[-1].get_coords()
AttributeError: 'float' object has no attribute 'get_coords'
This method could be called like this if you had a list of Location objects. The error is because the loc_list is populated with distances and not Location objects. That happens in function perform_sim when instead of geting the location you are making a deep copy of distance.
Perhaps you could try something like this:
def perform_trial(time, f):
start = f.get_loc()
distances = [0,0]
locations = []
for t in range(1, time + 1):
f.get_drunk().move(f)
new_loc = f.get_loc()
locations.append(new_loc)
distance = new_loc.get_dist(start)
distances.append(distance)
return distances, locations
def perform_sim(time, num_trials, drunk_type):
dist_lists = []
loc_lists = []
for trial in range(num_trials):
d = drunk_type('Drunk' + str(trial))
f = Field(d, Location(0, 0))
distances, locations = perform_trial(time, f)
dist_lists.append(distances)
loc_lists.append(locations)
return dist_lists, loc_lists
I hope that helped you out.
This is a python script for detecting features in a set of images for a SVM.
import os
import sys
import argparse
import _pickle as cPickle
import json
import cv2
import numpy as np
from sklearn.cluster import KMeans
def build_arg_parser():
parser = argparse.ArgumentParser(description='Creates features for given images')
parser.add_argument("--samples", dest="cls", nargs="+", action="append",
required=True, help="Folders containing the training images. \
The first element needs to be the class label.")
parser.add_argument("--codebook-file", dest='codebook_file', required=True,
help="Base file name to store the codebook")
parser.add_argument("--feature-map-file", dest='feature_map_file', required=True,
help="Base file name to store the feature map")
parser.add_argument("--scale-image", dest="scale", type=int, default=150,
help="Scales the longer dimension of the image down to this size.")
return parser
def load_input_map(label, input_folder):
combined_data = []
if not os.path.isdir(input_folder):
print ("The folder " + input_folder + " doesn't exist")
raise IOError
for root, dirs, files in os.walk(input_folder):
for filename in (x for x in files if x.endswith('.jpg')):
combined_data.append({'label': label, 'image': os.path.join(root, filename)})
return combined_data
class FeatureExtractor(object):
def extract_image_features(self, img):
kps = DenseDetector().detect(img)
kps, fvs = SIFTExtractor().compute(img, kps)
return fvs
def get_centroids(self, input_map, num_samples_to_fit=10):
kps_all = []
count = 0
cur_label = ''
for item in input_map:
if count >= num_samples_to_fit:
if cur_label != item['label']:
count = 0
else:
continue
count += 1
if count == num_samples_to_fit:
print ("Built centroids for", item['label'])
cur_label = item['label']
img = cv2.imread(item['image'])
img = resize_to_size(img, 150)
num_dims = 128
fvs = self.extract_image_features(img)
kps_all.extend(fvs)
kmeans, centroids = Quantizer().quantize(kps_all)
return kmeans, centroids
def get_feature_vector(self, img, kmeans, centroids):
return Quantizer().get_feature_vector(img, kmeans, centroids)
def extract_feature_map(input_map, kmeans, centroids):
feature_map = []
for item in input_map:
temp_dict = {}
temp_dict['label'] = item['label']
print ("Extracting features for", item['image'])
img = cv2.imread(item['image'])
img = resize_to_size(img, 150)
temp_dict['feature_vector'] = FeatureExtractor().get_feature_vector(
img, kmeans, centroids)
if temp_dict['feature_vector'] is not None:
feature_map.append(temp_dict)
return feature_map
class Quantizer(object):
def __init__(self, num_clusters=32):
self.num_dims = 128
self.extractor = SIFTExtractor()
self.num_clusters = num_clusters
self.num_retries = 10
def quantize(self, datapoints):
kmeans = KMeans(self.num_clusters,
n_init=max(self.num_retries, 1),
max_iter=10, tol=1.0)
res = kmeans.fit(datapoints)
centroids = res.cluster_centers_
return kmeans, centroids
def normalize(self, input_data):
sum_input = np.sum(input_data)
if sum_input > 0:
return input_data / sum_input
else:
return input_data
def get_feature_vector(self, img, kmeans, centroids):
kps = DenseDetector().detect(img)
kps, fvs = self.extractor.compute(img, kps)
labels = kmeans.predict(fvs)
fv = np.zeros(self.num_clusters)
for i, item in enumerate(fvs):
fv[labels[i]] += 1
fv_image = np.reshape(fv, ((1, fv.shape[0])))
return self.normalize(fv_image)
class DenseDetector(object):
def __init__(self, step_size=20, feature_scale=40, img_bound=20):
self.detector = cv2.xfeatures2d.SIFT_create("Dense")
self.detector.setInt("initXyStep", step_size)
self.detector.setInt("initFeatureScale", feature_scale)
self.detector.setInt("initImgBound", img_bound)
def detect(self, img):
return self.detector.detect(img)
class SIFTExtractor(object):
def compute(self, image, kps):
if image is None:
print ("Not a valid image")
raise TypeError
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
kps, des = cv2.SIFT().compute(gray_image, kps)
return kps, des
# Resize the shorter dimension to 'new_size'
# while maintaining the aspect ratio
def resize_to_size(input_image, new_size=150):
h, w = input_image.shape[0], input_image.shape[1]
ds_factor = new_size / float(h)
if w < h:
ds_factor = new_size / float(w)
new_size = (int(w * ds_factor), int(h * ds_factor))
return cv2.resize(input_image, new_size)
if __name__=='__main__':
args = build_arg_parser().parse_args()
input_map = []
for cls in args.cls:
assert len(cls) >= 2, "Format for classes is `<label> file`"
label = cls[0]
input_map += load_input_map(label, cls[1])
downsample_length = args.scale
# Building the codebook
print ("===== Building codebook =====")
kmeans, centroids = FeatureExtractor().get_centroids(input_map)
if args.codebook_file:
with open(args.codebook_file, 'w') as f:
pickle.dump((kmeans, centroids), f)
# Input data and labels
print ("===== Building feature map =====")
feature_map = extract_feature_map(input_map, kmeans, centroids)
if args.feature_map_file:
with open(args.feature_map_file, 'w') as f:
pickle.dump(feature_map, f)
I receive the following error:
Traceback (most recent call last):
File "create_features.py", line 164, in <module>
assert len(cls) >= 2, ("Format for classes is `<label> file`")
AssertionError: Format for classes is `<label> file`
Any idea of what could be wrong? I'm just following the instructions of 'OpenCV with Python by Example' of Prateek Joshi. Pages 494-526
Assertion are used to check a condition. If the condition isn't satisfied, it throes AssertionError. In your case, len(cls) >= 2 isn't satisfied. It means that len(cls) is smaller than 2. Apparently, cls is a list of arguments passed to the programm. And the first element of this list must be a label. And when you add argument (a file), you should specify a label for this file.
For example, if you choose a label name my_label, you must add file with my_label my_file.
when l run the following program l got this error :
originDataset = dataset.lmdbDataset(originPath, 'abc', *args)
TypeError: __init__() takes from 1 to 4 positional arguments but 9 were given
This error is relate to the second code source l presented below. it's strange because l don't have 9 argument. what's wrong with my code ?
import sys
origin_path = sys.path
sys.path.append("..")
import dataset
sys.path = origin_path
import lmdb
def writeCache(env, cache):
with env.begin(write=True) as txn:
for k, v in cache.iteritems():
txn.put(k, v)
def convert(originPath, outputPath):
args = [0] * 6
originDataset = dataset.lmdbDataset(originPath, 'abc', *args)
print('Origin dataset has %d samples' % len(originDataset))
labelStrList = []
for i in range(len(originDataset)):
label = originDataset.getLabel(i + 1)
labelStrList.append(label)
if i % 10000 == 0:
print(i)
lengthList = [len(s) for s in labelStrList]
items = zip(lengthList, range(len(labelStrList)))
items.sort(key=lambda item: item[0])
env = lmdb.open(outputPath, map_size=1099511627776)
cnt = 1
cache = {}
nSamples = len(items)
for i in range(nSamples):
imageKey = 'image-%09d' % cnt
labelKey = 'label-%09d' % cnt
origin_i = items[i][1]
img, label = originDataset[origin_i + 1]
cache[labelKey] = label
cache[imageKey] = img
if cnt % 1000 == 0 or cnt == nSamples:
writeCache(env, cache)
cache = {}
print('Written %d / %d' % (cnt, nSamples))
cnt += 1
nSamples = cnt - 1
cache['num-samples'] = str(nSamples)
writeCache(env, cache)
print('Convert dataset with %d samples' % nSamples)
if __name__ == "__main__":
convert('/share/datasets/scene_text/Synth90k/synth90k-val-lmdb', '/share/datasets/scene_text/Synth90k/synth90k-val-ordered-lmdb')
convert('/share/datasets/scene_text/Synth90k/synth90k-train-lmdb', '/share/datasets/scene_text/Synth90k/synth90k-train-ordered-lmdb')
which calls the following program :
#!/usr/bin/python
# encoding: utf-8
import random
import torch
from torch.utils.data import Dataset
from torch.utils.data import sampler
import torchvision.transforms as transforms
import lmdb
import six
import sys
from PIL import Image
import numpy as np
class lmdbDataset(Dataset):
def __init__(self, root=None, transform=None, target_transform=None):
self.env = lmdb.open(
root,
max_readers=1,
readonly=True,
lock=False,
readahead=False,
meminit=False)
if not self.env:
print('cannot creat lmdb from %s' % (root))
sys.exit(0)
with self.env.begin(write=False) as txn:
nSamples = int(txn.get('num-samples'))
self.nSamples = nSamples
self.transform = transform
self.target_transform = target_transform
def __len__(self):
return self.nSamples
def __getitem__(self, index):
assert index <= len(self), 'index range error'
index += 1
with self.env.begin(write=False) as txn:
img_key = 'image-%09d' % index
imgbuf = txn.get(img_key)
buf = six.BytesIO()
buf.write(imgbuf)
buf.seek(0)
try:
img = Image.open(buf).convert('L')
except IOError:
print('Corrupted image for %d' % index)
return self[index + 1]
if self.transform is not None:
img = self.transform(img)
label_key = 'label-%09d' % index
label = str(txn.get(label_key))
if self.target_transform is not None:
label = self.target_transform(label)
return (img, label)
class resizeNormalize(object):
def __init__(self, size, interpolation=Image.BILINEAR):
self.size = size
self.interpolation = interpolation
self.toTensor = transforms.ToTensor()
def __call__(self, img):
img = img.resize(self.size, self.interpolation)
img = self.toTensor(img)
img.sub_(0.5).div_(0.5)
return img
class randomSequentialSampler(sampler.Sampler):
def __init__(self, data_source, batch_size):
self.num_samples = len(data_source)
self.batch_size = batch_size
def __iter__(self):
n_batch = len(self) // self.batch_size
tail = len(self) % self.batch_size
index = torch.LongTensor(len(self)).fill_(0)
for i in range(n_batch):
random_start = random.randint(0, len(self) - self.batch_size)
batch_index = random_start + torch.range(0, self.batch_size - 1)
index[i * self.batch_size:(i + 1) * self.batch_size] = batch_index
# deal with tail
if tail:
random_start = random.randint(0, len(self) - self.batch_size)
tail_index = random_start + torch.range(0, tail - 1)
index[(i + 1) * self.batch_size:] = tail_index
return iter(index)
def __len__(self):
return self.num_samples
class alignCollate(object):
def __init__(self, imgH=32, imgW=128, keep_ratio=False, min_ratio=1):
self.imgH = imgH
self.imgW = imgW
self.keep_ratio = keep_ratio
self.min_ratio = min_ratio
def __call__(self, batch):
images, labels = zip(*batch)
imgH = self.imgH
imgW = self.imgW
if self.keep_ratio:
ratios = []
for image in images:
w, h = image.size
ratios.append(w / float(h))
ratios.sort()
max_ratio = ratios[-1]
imgW = int(np.floor(max_ratio * imgH))
imgW = max(imgH * self.min_ratio, imgW) # assure imgH >= imgW
transform = resizeNormalize((imgW, imgH))
images = [transform(image) for image in images]
images = torch.cat([t.unsqueeze(0) for t in images], 0)
return images, labels