Creating custom DependencyParser from scratch in Spacy 3 - python-3.x

I am trying to implement my own DependencyParser from scratch in Spacy 3. I create an empty model, create an empty DependencyParser, train it and save its configuration. But when I try to load my custom parser config again, I can only do it successfully if the model is empty. If I am using a non-empty model, then I keep getting this error - ValueError: could not broadcast input array from shape (106,64) into shape (27,64).
import spacy
import random
from spacy.tokens import Doc
from import Example
from spacy.pipeline import DependencyParser
from typing import List, Tuple
PARSER_CONFIG = 'parser.cfg'
('find a high paying job with no experience', {
'heads': [0, 4, 4, 4, 0, 7, 7, 4],
'deps': ['ROOT', '-', 'QUALITY', 'QUALITY', 'ACTIVITY', '-', 'QUALITY', 'ATTRIBUTE']
('find good workout classes near home', {
'heads': [0, 3, 3, 0, 5, 3],
def create_training_examples(training_data: List[Tuple]) -> List[Example]:
    """Create the list of training examples.

    Loads the ``en_core_web_md`` pipeline, runs it over every text and pairs
    the resulting ``Doc`` with the annotations given for that text.

    Args:
        training_data: ``(text, annotations)`` pairs, where ``annotations``
            is the dict handed to ``Example.from_dict``.

    Returns:
        One ``Example`` per entry of ``training_data``, in input order.
    """
    examples = []
    nlp = spacy.load('en_core_web_md')
    for text, annotations in training_data:
        # Echo each sample so the data being trained on is visible.
        print(f"{text} - {annotations}")
        examples.append(Example.from_dict(nlp(text), annotations))
    return examples
def save_parser_config(parser: DependencyParser):
print(f"Save parser config to '{PARSER_CONFIG}' ... ", end='')
def load_parser_config(parser: DependencyParser):
print(f"Load parser config from '{PARSER_CONFIG}' ... ", end='')
def main():
nlp = spacy.blank('en')
# Create new parser
parser = nlp.add_pipe('parser', first=True)
for text, annotations in TRAINING_DATA:
for label in annotations['deps']:
if label not in parser.labels:
print(f"Added labels: {parser.labels}")
examples = create_training_examples(TRAINING_DATA)
# Training
# NOTE: The 'lambda: examples' part is mandatory in Spacy 3 -
optimizer = nlp.initialize(lambda: examples)
print(f"Training ... ", end='')
for i in range(25):
print(f"{i} ", end='')
nlp.update(examples, sgd=optimizer)
print(f"... DONE")
# I can load parser config to blank model ...
nlp = spacy.blank('en')
parser = nlp.add_pipe('parser')
# ... but I cannot load parser config to already existing model
# Return -> ValueError: could not broadcast input array from shape (106,64) into shape (27,64)
# nlp = spacy.load('en_core_web_md')
# parser = nlp.get_pipe('parser')
print(f"Current pipeline is {nlp.meta['pipeline']}")
doc = nlp(u'find a high paid job with no degree')
print(f"Arcs: {[(w.text, w.dep_, w.head.text) for w in doc if w.dep_ != '-']}")
if __name__ == '__main__':
The custom parser itself is working as expected. You can test this by commenting out all the code from save_parser_config(parser) to load_parser_config(parser) (inclusive), and run the code again. You will see new labels are assigned as needed. This is why I think the root of the problem is the inability to load the parser configuration of an empty model into a non-empty model. But how to get around this?

I contacted the developers and this is what they answered -


Text Classification on a custom dataset with spacy v3

I am really struggling to make things work with the new spaCy v3 version. The documentation is extensive; however, I am trying to run a training loop in a script and cannot get it to work.
(I am also not able to perform text classification training with the CLI approach.)
The data are publicly available here.
import pandas as pd
from import Example
import random
TRAIN_DATA = pd.read_json('data.jsonl', lines = True)
nlp = spacy.load('en_core_web_sm')
config = {
"threshold": 0.5,
textcat = nlp.add_pipe("textcat", config=config, last=True)
label = TRAIN_DATA['label'].unique()
for label in label:
nlp = spacy.blank("en")
# Loop for 100 iterations
for itn in range(100):
# Shuffle the training data
losses = {}
TRAIN_DATA = TRAIN_DATA.sample(frac = 1)
# Batch the examples and iterate over them
for batch in spacy.util.minibatch(TRAIN_DATA.values, size=4):
texts = [nlp.make_doc(text) for text, entities in batch]
annotations = [{"cats": entities} for text, entities in batch]
# uses an example object rather than text/annotation tuple
examples = [Example.from_dict(a)]
nlp.update(examples, losses=losses)
if itn % 20 == 0:

How to set the label names when using the Huggingface TextClassificationPipeline?

I am using a fine-tuned Huggingface model (on my company data) with the TextClassificationPipeline to make class predictions. Now the labels that this Pipeline predicts default to LABEL_0, LABEL_1 and so on. Is there a way to supply the label mappings to the TextClassificationPipeline object so that the output reflects them?
Sample Code:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer
# Raw string: in a normal literal "\t" and "\f" would be read as a tab and a
# form feed, silently corrupting the Windows-style path.
MODEL_DIR = r"path\to\my\fine-tuned\model"
# Feature extraction pipeline
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
pipeline = TextClassificationPipeline(model=model,
result = pipeline("It was a good watch. But a little boring.")[0]
In [2]: result
Out[2]: {'label': 'LABEL_1', 'score': 0.8864616751670837}
The simplest way to add such a mapping is to edit the config.json of the model so that it contains an id2label field, as below:
"_name_or_path": "distilbert-base-uncased",
"activation": "gelu",
"architectures": [
"id2label": [
"attention_dropout": 0.1,
An in-code way to set this mapping is to add the id2label param to the from_pretrained call, as below:
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR, id2label={0: 'negative', 1: 'positive'})
Here is the Github Issue I raised for this to get added into the Documentation of transformers.XForSequenceClassification.

Torchtext 0.7 shows Field is being deprecated. What is the alternative?

Looks like the previous paradigm of declaring Fields, Examples and using BucketIterator is deprecated and will move to legacy in 0.8. However, I don't seem to be able to find an example of the new paradigm for custom datasets (as in, not the ones included in torch.datasets) that doesn't use Field. Can anyone point me at an up-to-date example?
Reference for deprecation:
It took me a little while to find the solution myself. The new paradigm is like so for prebuilt datasets:
from torchtext.experimental.datasets import AG_NEWS
train, test = AG_NEWS(ngrams=3)
or like so for custom built datasets:
from import DataLoader
def collate_fn(batch):
texts, labels = [], []
for label, txt in batch:
return texts, labels
dataloader = DataLoader(train, batch_size=8, collate_fn=collate_fn)
for idx, (texts, labels) in enumerate(dataloader):
print(idx, texts, labels)
I've copied the examples from the Source
Browsing through torchtext's GitHub repo I stumbled over the README in the legacy directory, which is not documented in the official docs. The README links a GitHub issue that explains the rationale behind the change as well as a migration guide.
If you just want to keep your existing code running with torchtext 0.9.0, where the deprecated classes have been moved to the legacy module, you have to adjust your imports:
# Old import path (torchtext < 0.9), kept for comparison:
# from torchtext.data import Field, TabularDataset
# From torchtext 0.9.0 on, the deprecated classes live in the legacy module:
from torchtext.legacy.data import Field, TabularDataset
Alternatively, you can import the whole torchtext.legacy module as torchtext as suggested by the README:
import torchtext.legacy as torchtext
There is a post regarding this. Instead of the deprecated Field and BucketIterator classes, it uses the TextClassificationDataset along with the collator and other preprocessing. It reads a txt file and builds a dataset, followed by a model. Inside the post, there is a link to a complete working notebook. The post is at: But you need the 'dev' (or nightly build) of PyTorch for it to work.
From the link above:
After tokenization and building vocabulary, you can build the dataset as follows
def data_to_dataset(data, tokenizer, vocab):
data = [(text, label) for (text, label) in data]
text_transform = sequential_transforms(tokenizer.tokenize,
label_transform = sequential_transforms(lambda x: 1 if x =='1' else (0 if x =='0' else x),
transforms = (text_transform, label_transform)
dataset = TextClassificationDataset(data, vocab, transforms)
return dataset
The collator is as follows:
# NOTE: the ``class`` header that encloses these two methods is not part of
# the snippet; they are written here exactly as they would appear inside it.
def __init__(self, pad_idx):
    """Store the value used to pad short sequences.

    Args:
        pad_idx: Fill value passed to ``pad_sequence`` as ``padding_value``.
    """
    self.pad_idx = pad_idx


def collate(self, batch):
    """Turn a list of ``(text, label)`` samples into two batch tensors.

    Args:
        batch: Iterable of ``(text, label)`` pairs; each ``text`` is expected
            to be a 1-D tensor of token ids (required by ``pad_sequence``).

    Returns:
        ``(text, labels)``: ``text`` is padded with ``self.pad_idx`` to the
        longest sequence and laid out batch-first; ``labels`` is a
        ``LongTensor`` with one entry per sample.
    """
    text, labels = zip(*batch)
    labels = torch.LongTensor(labels)
    text = nn.utils.rnn.pad_sequence(text, padding_value=self.pad_idx, batch_first=True)
    return text, labels
Then you can build the dataloader with the typical torch.utils.data.DataLoader, using the collate_fn argument.
Well, it seems the pipeline could look like this:
import torchtext as TT
import torch
from collections import Counter
from torchtext.vocab import Vocab
# read the data
with open('text_data.txt','r') as f:
data = f.readlines()
with open('labels.txt', 'r') as f:
labels = f.readlines()
tokenizer ='spacy', 'en') # can remove 'spacy' and use a simple built-in tokenizer
train_iter = zip(labels, data)
counter = Counter()
for (label, line) in train_iter:
vocab = TT.vocab.Vocab(counter, min_freq=1)
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
# this is data-specific - adapt for your data
label_pipeline = lambda x: 1 if x == 'positive\n' else 0
class TextData(torch.utils.data.Dataset):
    """Very basic dataset for processing text data.

    Keeps two parallel sequences and serves them as ``(label, text)`` pairs.
    """

    def __init__(self, labels, text):
        """Store the parallel ``labels`` and ``text`` sequences."""
        super().__init__()
        self.labels = labels
        self.text = text

    def __getitem__(self, index):
        """Return the ``(label, text)`` pair stored at ``index``."""
        return self.labels[index], self.text[index]

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)
def tokenize_batch(batch, max_len=200):
    """Collate function to use in ``DataLoader``.

    Takes a batch of the text dataset and produces a tensor batch,
    converting texts and labels through the module-level ``text_pipeline``
    (tokenizer) and ``label_pipeline`` (labeler) functions.

    Args:
        batch: Iterable of ``(label, text)`` samples.
        max_len: Fixed length of every encoded text. A shorter text is
            padded on the left with ones (the pad number); a longer text is
            truncated so that its last ``max_len`` tokens are kept.

    Returns:
        ``(labels, texts)``: a ``FloatTensor`` with one label per sample and
        an ``int32`` tensor of shape ``(len(batch), max_len)``.
    """
    labels_list, text_list = [], []
    for _label, _text in batch:
        labels_list.append(label_pipeline(_label))
        text_holder = torch.ones(max_len, dtype=torch.int32)  # fixed size tensor of max_len
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int32)
        pos = min(max_len, len(processed_text))
        if pos:
            # Guarded because ``-0:`` selects the whole tensor, not nothing.
            text_holder[-pos:] = processed_text[-pos:]
        text_list.append(text_holder)
    return torch.FloatTensor(labels_list), torch.stack(text_list, dim=0)
train_dataset = TextData(labels, data)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=tokenize_batch)
lbl, txt = iter(train_loader).next()

Convert NER SpaCy format to IOB format

I have data which is already labelled in SpaCy format. For example:
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]})
But I want to try training it with any other NER model, such as BERT-NER, which requires IOB tagging instead. Is there any conversion code from SpaCy data format to IOB?
This is closely related to and mostly copied from, see the notes in the comments there, too:
import spacy
from import biluo_tags_from_offsets
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
nlp = spacy.load('en_core_web_sm')
docs = []
for text, annot in TRAIN_DATA:
doc = nlp(text)
tags = biluo_tags_from_offsets(doc, annot['entities'])
# then convert L->I and U->B to have IOB tags for the tokens in the doc
I am afraid you will have to write your own conversion, because the IOB encoding depends on which tokenization the pre-trained representation model (BERT, RoBERTa or whatever pre-trained model you choose) uses.
The SpaCy format specifies the character span of the entity, i.e.
"Who is Shaka Khan?"[7:17]
will return "Shaka Khan". You need to match that to tokens used by the pre-trained model.
Here are examples of how different models tokenize the example sentence when you use Huggingface's Transformers.
BERT: ['Who', 'is', 'S', '##hak', '##a', 'Khan', '?']
RoBERTa: ['Who', '_is', '_Sh', 'aka', '_Khan', '?']
XLNet: ['▁Who', '▁is', '▁Shak', 'a', '▁Khan', '?']
Once you know how the tokenizer works, you can implement the conversion. Something like this can work for BERT tokenization.
# Convert character-offset entities into one IOB tag per BERT word piece.
# Offsets are rebuilt from the tokens alone, assuming exactly one space before
# every token that is not a "##" continuation (punctuation included), so
# positions after punctuation or repeated whitespace may drift.
entities = [(7, 17, "PERSON")]
tokenized = ['Who', 'is', 'S', '##hak', '##a', 'Khan', '?']
cur_start = 0
state = "O"  # Outside
tags = []
remaining = list(entities)  # consumed left to right; ``entities`` stays intact
for index, token in enumerate(tokenized):
    # Deal with BERT's way of encoding spaces
    if token.startswith("##"):
        token = token[2:]
    elif index:
        cur_start += 1  # skip the space that separates this word from the last
    cur_end = cur_start + len(token)
    # Drop entities that ended before this token starts.
    while remaining and remaining[0][1] <= cur_start:
        remaining.pop(0)
        state = "O"
    if remaining and remaining[0][0] < cur_end:
        label = remaining[0][2]
        # First piece of an entity gets "B-", every following piece "I-".
        tags.append(("B-" if state == "O" else "I-") + label)
        state = "I-" + label
    else:
        tags.append("O")
    cur_start = cur_end
Note that the snippet would break if one BERT token contained the end of one entity and the start of the following one. The tokenizer also does not distinguish how many spaces (or other whitespace characters) there were in the original string; this is a potential source of errors as well.
First, you need to convert your annotated JSON file to CSV.
Then you can run the code below to convert it into the spaCy V2 binary format.
df = pd.read_csv('SC_CSV.csv')
l1 = []
l2 = []
for i in range(0, len(df['ner'])):
TRAIN_DATA = list(zip(l1, l2))
Now TRAIN_DATA is in the spaCy V2 format.
The following helps to convert the data from the old spaCy v2 format to the new spaCy v3 format.
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
# Convert spaCy v2 style TRAIN_DATA into a spaCy v3 binary ``.spacy`` file.
nlp = spacy.blank("en")  # load a new spacy model
db = DocBin()  # create a DocBin object
for text, annot in tqdm(TRAIN_DATA):  # data in previous format
    doc = nlp.make_doc(text)  # create doc object from text
    ents = []
    for start, end, label in annot["entities"]:  # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # The offsets do not line up with token boundaries.
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)  # without this the saved DocBin would be empty
db.to_disk("./train.spacy")  # save the docbin object
I have faced this kind of problem.
What I did was transform the data to the spaCy binary format and then load the data from the DocBin object using this code.
import spacy
from spacy.tokens import DocBin
Then this code may help you to extract the IOB format from it.
for elem in Documents[0]:
else :
Here is an example of my output:
عبرت O
الديناميكية B - POLITIQUE
النسوية I - POLITIQUE
التي O
تأسست O
بعد O
25 O
جويلية O
2021 O
عن O
رفضها O
القطعي O
لمشروع O
المرسوم B - POLITIQUE
عدد O
88 O
لسنة O
import spacy
from import biluo_tags_from_offsets
data = data
# Print one "token<TAB>IOB-tag" line per token of every annotated text.
nlp = spacy.blank("en")
for text, labels in data:
    # Tokenize the annotated text itself so the character offsets line up.
    doc = nlp(text)
    ents = []
    for start, end, label in labels["entities"]:
        span = doc.char_span(start, end, label)
        if span is None:
            # Offsets that do not match token boundaries yield no span;
            # assigning None to doc.ents would raise, so skip them.
            continue
        ents.append(span)
    doc.ents = ents
    for tok in doc:
        tag = tok.ent_iob_
        if tok.ent_iob_ != "O":
            tag += '-' + tok.ent_type_
        print(tok, tag, sep="\t")
If you get a NoneType error, add a try block depending on your dataset, or clean your dataset.

scikit-learn - Using a single string with RandomForestClassifier.predict()?

I'm an sklearn dummy... I'm trying to predict the label for a given string from a RandomForestClassifier() fitted with text, labels.
It's obvious I don't know how to use predict() with a single string. The reason I'm using reshape() is because I got this error some time ago "Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample."
How can I predict the label of a single text string?
The script:
#!/usr/bin/env python
''' Read a txt file consisting of '<label>: <long string of text>'
to use as a model for predicting the label for a string
from argparse import ArgumentParser
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
def main(args):
args: Arguments obtained by _Get_Args()
print('Loading data...')
# Load data from args.txtfile and split the lines into
# two lists (labels, texts).
data = open(args.txtfile).readlines()
labels, texts = ([], [])
for line in data:
label, text = line.split(': ', 1)
# Print a list of unique labels
print(json.dumps(list(set(labels)), indent=4))
# Instantiate a CountVectorizer class and fit the texts
# and labels into it.
cv = CountVectorizer(
matrix = cv.fit_transform(texts)
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
rf = RandomForestClassifier(), labels)
# Try to predict the label for args.string.
prediction = Predict_Label(args.string, cv, rf)
def Predict_Label(string, cv, rf):
string: str() - A string of text
cv: The CountVectorizer class
rf: The RandomForestClassifier class
matrix = cv.fit_transform([string])
matrix = matrix.reshape(1, -1)
prediction = rf.predict(matrix)
except Exception as E:
return prediction
def _Get_Args():
    """Parse the command-line arguments of the script.

    Returns:
        The ``argparse`` namespace with two required string attributes:
        ``txtfile`` (``-t``/``--txtfile``) and ``string`` (``-s``/``--string``).
    """
    parser = ArgumentParser(description='Learn labels from text')
    parser.add_argument('-t', '--txtfile', required=True)
    parser.add_argument('-s', '--string', required=True)
    return parser.parse_args()
if __name__ == '__main__':
args = _Get_Args()
The actual learning data text file is 43663 lines long but a sample is in small_list.txt which consists of lines each in the format: <label>: <long text string>
The error is noted in the Exception output:
$ ./ -t small_list.txt -s 'This is a string that might have something to do with phishing or fraud'
Loading data...
"Vulnerabilities__MSSQL Browsing Service",
"Fraud__Copyright/Trademark Infringement",
"Attacks and Reconnaissance__Web Attacks",
"Vulnerabilities__Vulnerable SMB",
"Internal Report__SBL Notify",
"Objectionable Content__Russian Federation Objectionable Material",
"Malicious Code/Traffic__Malicious URL",
"Spam__Marketing Spam",
"Attacks and Reconnaissance__Scanning",
"Malicious Code/Traffic__Unknown",
"Attacks and Reconnaissance__SSH Brute Force",
"Spam__URL in Spam",
"Vulnerabilities__Vulnerable Open Memcached",
"Malicious Code/Traffic__Sinkhole",
"Attacks and Reconnaissance__SMTP Brute Force",
"Illegal content__Child Pornography"
Number of features of the model must match the input. Model n_features is 2070 and input n_features is 3
You need to get the vocabulary of the first CountVectorizer (cv) and use it to transform the new single text before predicting.
cv = CountVectorizer(
matrix = cv.fit_transform(texts)
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
rf = RandomForestClassifier(), labels)
# Try to predict the label for args.string.
cv_new = CountVectorizer(
prediction = Predict_Label(args.string, cv_new, rf)
