I was replicating code that fine-tunes a model for domain adaptation. This is the link to the post, with more details:
(https://towardsdatascience.com/fine-tuning-for-domain-adaptation-in-nlp-c47def356fd6)
The code is as follows:
!pip install -q transformers
!pip install -q datasets
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer
# HYPERPARAMS
SEED_SPLIT = 0
SEED_TRAIN = 0
MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01
# load data
dtf_mlm = pd.read_csv('data/jigsaw_train.csv', nrows=1000)
dtf_mlm = dtf_mlm[dtf_mlm["target"] < 0.5]
dtf_mlm = dtf_mlm.rename(columns={"comment_text": "text"})
# Train/Valid Split
df_train, df_valid = train_test_split(
dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)
len(df_train), len(df_valid)
# Convert to Dataset object
train_dataset = Dataset.from_pandas(df_train[['text']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['text']].dropna())
#Model Selection Part
MODEL = 'bert'
bert_type = 'bert-base-cased'
TokenizerClass = BertTokenizer
ModelClass = BertForMaskedLM
#Tokenization Part
tokenizer = TokenizerClass.from_pretrained(
bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
)
model = ModelClass.from_pretrained(bert_type)
def tokenize_function(row):
    return tokenizer(
        row['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True)
column_names = train_dataset.column_names
train_dataset = train_dataset.map(
tokenize_function,
batched=True,
num_proc=multiprocessing.cpu_count(),
remove_columns=column_names,
)
valid_dataset = valid_dataset.map(
tokenize_function,
batched=True,
num_proc=multiprocessing.cpu_count(),
remove_columns=column_names,
)
#Training and Model Saving Part
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)
training_args = TrainingArguments(
output_dir='./bert-news',
logging_dir='./LMlogs',
num_train_epochs=2,
do_train=True,
do_eval=True,
per_device_train_batch_size=TRAIN_BATCH_SIZE,
per_device_eval_batch_size=EVAL_BATCH_SIZE,
warmup_steps=LR_WARMUP_STEPS,
save_steps=steps_per_epoch,
save_total_limit=3,
weight_decay=WEIGHT_DECAY,
learning_rate=LEARNING_RATE,
evaluation_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True,
metric_for_best_model='loss',
greater_is_better=False,
seed=SEED_TRAIN
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=valid_dataset,
tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("SavedModel/TestModel") #save your custom model
This is the GPU that I am using (the details were shown in a screenshot, not reproduced here). I want to use the GPU for training the model on about 1.5 million comments.
I tried doing this:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#Setting the tokenizer and the model
tokenizer = TokenizerClass.from_pretrained(
bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
)
model = ModelClass.from_pretrained(bert_type).to(device)
But I am unsure how to send the inputs and tokens to the GPU.
Feel free to give your advice. I don't own this code, shout out to Marcello Politi. Thanks!
When you use the Trainer API you do not need to move the dataset or the batches yourself: as soon as a GPU is visible, the Trainer places the model and every batch on it automatically. (A datasets.Dataset object also has no .to() method, so calling train_dataset.to(device) would fail.) It is enough to confirm that CUDA is available before training:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)  # should print 'cuda'
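If you ever run a manual loop instead of the Trainer, this is roughly how the tokenized inputs would be sent to the GPU (a minimal sketch reusing the tokenizer, model and MAX_SEQ_LEN defined above; the comment string is just a placeholder):
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# tokenize one (made-up) comment and move every input tensor to the GPU
batch = tokenizer(
    ["example comment"],
    padding='max_length',
    truncation=True,
    max_length=MAX_SEQ_LEN,
    return_tensors='pt',
)
batch = {k: v.to(device) for k, v in batch.items()}  # input_ids, attention_mask, ...

with torch.no_grad():
    outputs = model(**batch)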
I am training an NLP Hugging Face model on Vertex AI with a custom image.
The same code works on my local machine.
Here are my code and the error.
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import transformers as tr
from sentence_transformers import SentenceTransformer
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from scipy.special import softmax
import scipy
import random
import pickle
import os
print("package imported completed")
os.environ['TRANSFORMERS_OFFLINE']='1'
os.environ['HF_MLFLOW_LOG_ARTIFACTS']='TRUE'
print("env setup completed")
print( tr.__version__)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using", device)
torch.backends.cudnn.deterministic = True
tr.trainer_utils.set_seed(0)
print("here")
tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large",local_files_only=True)
model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True,local_files_only=True)
model.to(device)
print("Model loaded successfully")
df=pd.read_csv("gs://****bucket***/data.csv")
print("read csv")
# ,engine='openpyxl',sheet_name="master_data"
train_df=df.text.tolist()
print(len(train_df))
train_df=list(set(train_df))
train_df = [x for x in train_df if str(x) != 'nan']
train_df=train_df[:50]
print("Length of training data is \n ",len(train_df))
print("DATA LOADED successfully")
train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
print("encoding done")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
print("data collector done")
class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings["attention_mask"])
train_data = SEDataset(train_encodings)
print("train data created")
training_args = tr.TrainingArguments(
output_dir='gs://****bucket***/results_mlm_exp1',
overwrite_output_dir=True,
num_train_epochs=2,
per_device_train_batch_size=4,
# per_device_train_batch_size
# per_gpu_train_batch_size
prediction_loss_only=True
# ,save_strategy="epoch"
# ,run_name="MLM_Exp1"
,learning_rate=2e-5
# logging_dir='gs://****bucket***/logs_mlm_exp1', # directory for storing logs
# logging_steps=32000,
)
trainer = tr.Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_data,
)
print("training to start")
trainer.train()
print("model training finished")
trainer.save_model("gs://****bucket***/model_mlm_exp1")
print("training finished")
The error that I get is:
None INFO train data created
None INFO training to start
None ERROR 0%| | 0/8 [00:00<?, ?it/s]train.py:70: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
None ERROR item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
None ERROR /opt/conda/lib/python3.7/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
None ERROR warnings.warn('Was asked to gather along dimension 0, but all '
/var/sitecustomize/sitecustomize.py INFO None
None ERROR 0%| | 0/8 [00:09<?, ?it/s]
Most of these are warnings, but my code still stops with an error.
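One thing that stands out in the log is the copy-construct warning coming from __getitem__. Since the tokenizer was called with return_tensors="pt", the encodings already hold tensors, so wrapping them in torch.tensor() again is what triggers it. A minimal change that removes that warning (it addresses the warning only, not necessarily the final failure) would be:
def __getitem__(self, idx):
    # the values are already tensors, so take a detached copy instead of re-wrapping them
    item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
    return item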
import pandas as pd
import matplotlib.pyplot as plt
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
data = pd.read_csv("./emails.csv")
print(data.head())
data = data[['text','email_sentiment']]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))  # note: a-zA-Z, not a-zA-z
print(data.head())
max_fatures = 50000
max_seq_length = 250
tokenizer = Tokenizer(num_words=max_fatures,filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['text'].values)
word_index = tokenizer.word_index
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X,maxlen=max_seq_length)
Y= pd.get_dummies(data['email_sentiment']).values
X_train,Y_train = train_test_split(X,Y, test_size = 0.10, random_state = 42)
embedding_vector_length = 100
lstm_out= 196
model = Sequential()
model.add(Embedding(max_fatures, embedding_vector_length, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
Error occurred:
X_train,Y_train = train_test_split(X,Y, test_size = 0.10, random_state = 42)
ValueError: too many values to unpack (expected 2)
I am unable to train on X_train due to this value error. X_train consists of emails that are categorized into positive, negative and neutral sentiment, matching the three LSTM output classes.
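For reference, scikit-learn's train_test_split returns one train/test pair for every array passed in, so passing X and Y yields four arrays rather than two. A minimal sketch of the usual unpacking (keeping the same split parameters as above):
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)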
I am trying to embed some documents, each containing a couple of sentences, using Hugging Face transformers models. I have a single node with multiple GPUs and I want to do the embedding in parallel, distributed across all 8 GPUs. I tried to use PyTorch DistributedDataParallel, but I think all sentences are being sent to all GPUs and it returns only one tensor for all of them. This is a sample code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import argparse
import os
from transformers import AlbertTokenizer, AlbertModel
import numpy
from tqdm import tqdm
from torch.utils.data import DataLoader,TensorDataset
def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '--local_rank',
        dest='local_rank',
        type=int,
        default=0,
    )
    parse.add_argument("--gpu", type=str, default='None',
                       help="choose gpu device.")
    return parse.parse_args()
def train():
    args = parse_args()
    if not args.gpu == 'None':
        device = torch.device("cuda")
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    else:
        device = torch.device("cpu")
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(
        backend='nccl',
        init_method='env://',
    )
    tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    sentences = ['I love tea',
                 'He hates tea',
                 'We love tea',
                 'python coder',
                 'geeksforgeeks',
                 'coder in geeksforgeeks']
    sentence_tokens = []
    for sent in sentences:
        token_id = tokenizer.encode(sent, max_length=128, add_special_tokens=True, pad_to_max_length=True)
        sentence_tokens.append(token_id)
    original_sentences = torch.tensor(sentence_tokens)
    train_dataset = TensorDataset(original_sentences)
    # setup training sampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=len(sentences))
    # setup training data loader with the train sampler setup
    train_dataloader = DataLoader(train_dataset, batch_size=16, sampler=train_sampler, shuffle=False)
    model = AlbertModel.from_pretrained('albert-xxlarge-v2', return_dict=True)
    model = model.to(device)
    model = nn.parallel.DistributedDataParallel(model,
                                                device_ids=[args.local_rank, ],
                                                output_device=args.local_rank,
                                                find_unused_parameters=True
                                                )
    for batch in train_dataloader:
        batch_input_tensors = batch[0].to('cuda')
        outputs = model(batch_input_tensors)
        last_hidden_states = outputs.last_hidden_state
        average = torch.mean(last_hidden_states, dim=1)

if __name__ == "__main__":
    train()
All of the sentences are being sent to all 8 GPUs, and the output last_hidden_states is only one tensor. I took the average of the tensor elements because I thought the results should end up the same on every GPU, but they aren't.
How can I do this in a distributed way, so that the sentences are split across the GPUs and embedded there, and so that I end up with one tensor as the feature vector for each sentence (or, in my final use case, for each document)?
Thanks!
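For what it's worth, here is a rough sketch of how the sharding part is usually set up so that each process (one per GPU) embeds only its own slice of the sentences and keeps one vector per sentence. It assumes the script is launched with torchrun --nproc_per_node=8 on a single node so that the rank and world size come from the environment, and that train_dataset and model are built as in the code above (for pure embedding with no backward pass, the DistributedDataParallel wrapper is not strictly needed):
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

dist.init_process_group(backend='nccl')   # RANK / WORLD_SIZE come from the torchrun environment
rank = dist.get_rank()
torch.cuda.set_device(rank)               # single node, so the global rank doubles as the GPU index

# num_replicas and rank default to the world size and this process's rank,
# so every process sees a *different* slice of the dataset
sampler = DistributedSampler(train_dataset, shuffle=False)
loader = DataLoader(train_dataset, batch_size=16, sampler=sampler, shuffle=False)

embeddings = []
with torch.no_grad():
    for (batch,) in loader:               # TensorDataset yields 1-tuples
        out = model(batch.to(rank))       # model is assumed to already live on this rank's GPU
        embeddings.append(out.last_hidden_state.mean(dim=1))  # one vector per sentence
local_embeddings = torch.cat(embeddings)  # embeddings for this rank's shard only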
I am loading my pre-trained Keras model and then trying to parallelize predictions over a large amount of input data using Dask. Unfortunately, I'm running into some issues relating to how I'm creating my Dask array. Any guidance would be greatly appreciated!
Setup:
First I cloned from this repo https://github.com/sanchit2843/dlworkshop.git
Reproducible Code Example:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from keras.models import load_model
import keras
from keras.models import Sequential
from keras.layers import Dense
from dask.distributed import Client
import warnings
import dask.array as da  # aliased as `da` to match its use below
import mlflow  # used inside load_and_predict below
from keras import backend as K  # used by contrastive_loss below
warnings.filterwarnings('ignore')
dataset = pd.read_csv('data/train.csv')
X = dataset.drop(['price_range'], axis=1).values
y = dataset[['price_range']].values
# scale data
sc = StandardScaler()
X = sc.fit_transform(X)
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
# Neural network
model = Sequential()
model.add(Dense(16, input_dim=20, activation="relu"))
model.add(Dense(12, activation="relu"))
model.add(Dense(4, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=64)
# Use dask
client = Client()
def load_and_predict(input_data_chunk):
    def contrastive_loss(y_true, y_pred):
        margin = 1
        square_pred = K.square(y_pred)
        margin_square = K.square(K.maximum(margin - y_pred, 0))
        return K.mean(y_true * square_pred + (1 - y_true) * margin_square)
    mlflow.set_tracking_uri('<uri>')
    mlflow.set_experiment('clean_parties_ml')
    runs = mlflow.search_runs()
    artifact_uri = runs.loc[runs['start_time'].idxmax()]['artifact_uri']
    model = mlflow.keras.load_model(artifact_uri + '/model', custom_objects={'contrastive_loss': contrastive_loss})
    y_pred = model.predict(input_data_chunk)
    return y_pred
da_input_data = da.from_array(X_test, chunks=(100, None))
prediction_results = da_input_data.map_blocks(load_and_predict, dtype=X_test.dtype).compute()
The Error I'm receiving:
AttributeError: '_thread._local' object has no attribute 'value'
Keras/Tensorflow don't play nicely with other threaded systems. There is an ongoing issue on this topic here: https://github.com/dask/dask-examples/issues/35
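If it helps, one workaround that is often suggested (a sketch only; whether it applies depends on the setup) is to have Dask run the prediction in separate single-threaded worker processes, so the Keras/TensorFlow state is never shared between threads:
from dask.distributed import Client
import dask.array as da

# one single-threaded process per worker instead of a shared threaded pool
client = Client(processes=True, threads_per_worker=1)

da_input_data = da.from_array(X_test, chunks=(100, None))
prediction_results = da_input_data.map_blocks(load_and_predict, dtype=X_test.dtype).compute()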
I'm doing the "Hello world" of machine learning, using the Iris dataset. I already have an acceptable result for this model; I am using 80% of the data to train it and the remaining 20% for validation, and the 6 prediction algorithms I use all work well.
But I have a problem: how can I feed in new information so that it is analyzed? How do I enter the characteristics of a flower and have the model tell me which type of iris it is: Iris-setosa, Iris-versicolor or Iris-virginica?
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)
#######Evaluate Some Algorithms########
#Create a Validation Dataset
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
########Build Models########
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'  # metric for cross_val_score; this assignment was missing from the snippet
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
########Make Predictions########
print('######## Make Predictions ########')
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I think you can follow this other post to save your model; afterwards you can load it, pass it new data, and make predictions.
Remember to give the new data the same shape as the inputs used during training.
import pickle  # the original snippet used cPickle, which is Python 2 only; in Python 3 use pickle

# save the classifier (`gnb` stands for whichever fitted model you want to keep, e.g. the knn trained above)
with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(gnb, fid)

# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    gnb_loaded = pickle.load(fid)

# make predictions
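To answer the original question, once you have a fitted model (the knn from the code above, or the reloaded gnb_loaded), classifying a new flower is just a matter of passing its four measurements in the same column order as the training data; the values below are made-up example measurements:
new_flower = [[5.1, 3.5, 1.4, 0.2]]   # sepal-length, sepal-width, petal-length, petal-width (hypothetical)
print(knn.predict(new_flower))        # e.g. ['Iris-setosa']
print(gnb_loaded.predict(new_flower)) # the pickled-and-reloaded model is used the same way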