Simplify large-data processing script - python-3.x

I am trying to do the following, but it takes too much time. Can someone please suggest a quicker way of doing this?
f = open('answer.csv', 'w')
f.write('Datetime,0: Vm,0: Va,1: Vm,1: Va,2: Vm,2: Va,3: Vm,3: Va,4: Vm,4: Va,5: Vm,5: Va,6: Vm,6: Va,7: Vm,7: Va,8: Vm,8: Va,9: Vm,9: Va,10: Vm,10: Va,11: Vm,11: Va,12: Vm,12: Va,13: Vm,13: Va\n')
# 'n' is around 8000000
# 'PQ_data' is a pandas DataFrame with more than n rows
# 'obj' is a Python object with some methods on it
for i in range(n):
    p = []
    q = []
    for j in range(1, 14):
        if j <= 10:
            p.append(PQ_data['{} P'.format(j)][i])
            q.append(PQ_data['{} Q'.format(j)][i])
        else:
            p.append(0)
            q.append(0)
    obj.do_something(p, q)
    vm = obj.get_Vm().tolist()
    va = obj.get_Va().tolist()
    # the two methods above return lists of length 14
    # PQ_data.index holds datetime values
    f.write('{}'.format(PQ_data.index[i]))
    for j in range(len(vm)):
        f.write(',{},{}'.format(vm[j], va[j]))
    f.write('\n')
f.close()

Try this. If it's still too slow, you might need to throw multiprocessing at it.
import csv
import itertools

with open('answer.csv', 'w', newline='') as fout:
    outfile = csv.writer(fout)
    outfile.writerow(['Datetime', '0: Vm', '0: Va', '1: Vm', '1: Va', '2: Vm', '2: Va', '3: Vm', '3: Va', '4: Vm', '4: Va', '5: Vm', '5: Va', '6: Vm', '6: Va', '7: Vm', '7: Va', '8: Vm', '8: Va', '9: Vm', '9: Va', '10: Vm', '10: Va', '11: Vm', '11: Va', '12: Vm', '12: Va', '13: Vm', '13: Va'])
    for i in range(n):
        p = [PQ_data['{} P'.format(j)][i] for j in range(1, 11)] + [0]*3
        q = [PQ_data['{} Q'.format(j)][i] for j in range(1, 11)] + [0]*3
        obj.do_something(p, q)
        vm = obj.get_Vm().tolist()
        va = obj.get_Va().tolist()
        row = itertools.chain([PQ_data.index[i]], itertools.chain.from_iterable((vm[j], va[j]) for j in range(len(vm))))
        outfile.writerow(row)
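If that is still too slow, most of the remaining cost is the repeated pandas column lookups inside the loop. Here is a further sketch along the same lines (assuming the P/Q columns are numeric and using the same placeholder object obj as above): pull each column out as a NumPy array once, then index plain arrays in the loop.
import csv

# Sketch only: n, PQ_data and the placeholder object obj are assumed to exist
# exactly as in the question.
header = ['Datetime'] + ['{}: {}'.format(k, name) for k in range(14) for name in ('Vm', 'Va')]
# Extract each needed column once, up front, instead of on every iteration.
p_cols = [PQ_data['{} P'.format(j)].to_numpy() for j in range(1, 11)]
q_cols = [PQ_data['{} Q'.format(j)].to_numpy() for j in range(1, 11)]
index_vals = PQ_data.index

with open('answer.csv', 'w', newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(header)
    for i in range(n):
        p = [col[i] for col in p_cols] + [0] * 3
        q = [col[i] for col in q_cols] + [0] * 3
        obj.do_something(p, q)
        vm = obj.get_Vm().tolist()
        va = obj.get_Va().tolist()
        row = [index_vals[i]]
        for m, a in zip(vm, va):
            row.extend((m, a))
        writer.writerow(row)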

Related

HuggingFace dataset error: RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same

I have taken code from several sources that use the Common Voice dataset. The only modification I made was changing the language from Turkish to Persian.
When I run the code, I encounter this error on the line trainer.train():
RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
This is my code. You can copy-paste it into Google Colab and run it (which is where I ran it):
!pip install datasets==1.13.3
!pip install transformers==4.11.3
!pip install huggingface_hub==0.0.19
!pip install torchaudio
!pip install librosa
!pip install jiwer
!apt install git-lfs
!pip install hazm
!pip install pydub
!pip install pythainlp
import os
import re
#from typing import List, Dict, Tuple
import pandas as pd
from scipy.io import wavfile
from pythainlp.tokenize import word_tokenize
#from spell_correction import correct_sentence
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from pydub import AudioSegment
from pythainlp.tokenize import word_tokenize, syllable_tokenize
from datasets import load_dataset, load_from_disk, load_metric
import hazm
import string
import torch
import os
#os.environ['CUDA_VISIBLE_DEVICES']='2, 3'
torch.cuda.empty_cache()
#print(torch.cuda.memory_summary(device=None, abbreviated=False))
print(torch.cuda.is_available())
_normalizer = hazm.Normalizer()
chars_to_ignore = [
",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
"#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
'ā', 'š',
# "ء",
]
# In case of farsi
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
chars_to_mapping = {
'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
"ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
"ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
# "ها": " ها", "ئ": "ی",
"۱۴ام": "۱۴ ام",
"a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
"g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
"m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
"s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
"y": " وای ", "z": " زد ",
"\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}
def multiple_replace(text, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

def remove_special_characters(text, chars_to_ignore_regex):
    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
    return text

def normalizer(text, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = text.lower().strip()
    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)
    _text = []
    for word in text.split():
        try:
            word = int(word)
            _text.append(words(word))
        except:
            _text.append(word)
    text = " ".join(_text) + " "
    text = text.strip()
    if not len(text) > 0:
        return None
    return text + " "
data_dir = "cv-corpus-9.0-2022-04-27/fa"
from datasets import load_dataset, load_metric, Audio
common_voice_train = load_dataset("common_voice", "fa", split="train")
common_voice_train = common_voice_train.select(range(500))
common_voice_dev = load_dataset("common_voice", "fa", split="validation")
common_voice_dev = common_voice_dev.select(range(50))
common_voice_test = load_dataset("common_voice", "fa", split="test")
common_voice_test = common_voice_test.select(range(50))
print(common_voice_train)
print(common_voice_dev)
print(common_voice_test)
from datasets import ClassLabel
import random
import pandas as pd
#from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    df = pd.DataFrame(dataset[picks])
    print(df.head())
#show_random_elements(common_voice_train.remove_columns(["path"]), num_examples=20)
def normalizer(batch, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = batch["sentence"].lower().strip()
    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)
    _text = []
    for word in text.split():
        try:
            word = int(word)
            _text.append(words(word))
        except:
            _text.append(word)
    text = " ".join(_text) + " "
    text = text.strip()
    if not len(text) > 0:
        return None
    if len(text) >= 32:
        text = text[:30]
    batch["sentence"] = text
    return batch
#print(common_voice_train[0]["sentence"])
#print(common_voice_dev[0]["sentence"])
#print(common_voice_test[0]["sentence"])
common_voice_train = common_voice_train.map(normalizer, fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping})
common_voice_dev = common_voice_dev.map(normalizer, fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping})
common_voice_test = common_voice_test.map(normalizer, fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping})
#print(common_voice_train[0]["sentence"])
#print(common_voice_dev[0]["sentence"])
#print(common_voice_test[0]["sentence"])
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_dev = common_voice_dev.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=4, keep_in_memory=True, remove_columns=common_voice_test.column_names)
vocab_list = list(sorted(set(vocab_train["vocab"][0]) | set(vocab_dev["vocab"][0]) | set(vocab_test["vocab"][0])))
vocab_list = [vocab for vocab in vocab_list if vocab not in [" ", "\u0307"]]
print(len(vocab_list))
print(vocab_list)
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]
vocab_dict = {v: k for k, v in enumerate(special_vocab + vocab_list)}
print(len(vocab_dict))
print(vocab_dict)
for name, age in vocab_dict.items():  # for Python 2.x: vocab_dict.iteritems()
    if age == 5:
        k1 = name
    elif age == 8:
        k2 = name
del vocab_dict[k1]
del vocab_dict[k2]
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)
from transformers.trainer_utils import get_last_checkpoint
save_dir = "model checkpoints/"
last_checkpoint = None
if os.path.exists(save_dir):
    last_checkpoint = get_last_checkpoint(save_dir)
print(last_checkpoint if last_checkpoint else str(None))
from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|",
    do_lower_case=False,
    max_length=31
)
text = "از مهمونداری کنار بکشم"
print(" ".join(tokenizer.tokenize(text)))
print(tokenizer.decode(tokenizer.encode(text)))
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
if len(processor.tokenizer.get_vocab()) == len(processor.tokenizer):
    print(len(processor.tokenizer))
if not os.path.exists(save_dir):
    print("Saving ...")
    processor.save_pretrained(save_dir)
    print("Saved!")
import torchaudio
import librosa
target_sampling_rate = 16_000
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, target_sampling_rate)
    batch["speech"] = speech_array
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)
common_voice_dev = common_voice_dev.map(speech_file_to_array_fn, remove_columns=common_voice_dev.column_names)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)
#print(common_voice_train[0]["sampling_rate"])
#print(common_voice_test[0]["sampling_rate"])
min_duration_in_seconds = 5.0
max_duration_in_seconds = 10.0
def filter_by_max_duration(batch):
    return min_duration_in_seconds <= batch["duration_in_seconds"] <= max_duration_in_seconds
print(f"Split sizes [BEFORE]: {len(common_voice_train)} train and {len(common_voice_test)} validation.")
_common_voice_train = common_voice_train.filter(filter_by_max_duration)
_common_voice_dev = common_voice_dev
_common_voice_test = common_voice_test
# _common_voice_test = common_voice_test.filter(filter_by_max_duration, num_proc=4)
print(f"Split sizes [AFTER]: {len(_common_voice_train)} train and {len(_common_voice_test)} validation.")
# check that all files have the correct sampling rate
def prepare_dataset(batch):
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch
_common_voice_train = _common_voice_train.map(prepare_dataset, remove_columns=_common_voice_train.column_names, batch_size=4, batched=True)
_common_voice_dev = _common_voice_dev.map(prepare_dataset, remove_columns=_common_voice_dev.column_names, batch_size=4, batched=True)
_common_voice_test = _common_voice_test.map(prepare_dataset, remove_columns=_common_voice_test.column_names, batch_size=4, batched=True)
#_common_voice_train.set_format(type='torch', columns=['input_values', 'labels'])
#_common_voice_dev.set_format(type='torch', columns=['input_values', 'labels'])
#_common_voice_test.set_format(type='torch', columns=['input_values', 'labels'])
###############################################################################################################
#torch.cuda.empty_cache()
#print(torch.cuda.memory_summary(device=None, abbreviated=False))
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                #max_length=64,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = load_metric("wer")
import random
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    if isinstance(label_str, list):
        if isinstance(pred_str, list) and len(pred_str) == len(label_str):
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str[index]}"')
        else:
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str}"')
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
from transformers import Wav2Vec2ForCTC, Wav2Vec2Config
configuration = Wav2Vec2Config(hidden_size=256, num_hidden_layers=6, num_attention_heads=6, intermediate_size=1024)
model_args ={}
print('haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
print(len(processor.tokenizer.get_vocab()))
print('haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53" if not last_checkpoint else last_checkpoint,
    #model_name_or_path if not last_checkpoint else last_checkpoint,
    attention_dropout=0.1,
    #hidden_size=256,
    #num_hidden_layers=8,
    #num_attention_heads=2,
    #intermediate_size=256,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer.get_vocab())
    #vocab_size=64
)
model.config = configuration
model.freeze_feature_extractor()
model = model.to(torch.device("cuda"))
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir=save_dir,
    group_by_length=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=0.5,
    fp16=True,
    #save_steps=10,
    #eval_steps=10,
    #logging_steps=10,
    learning_rate=1e-4,
    #warmup_steps=500,
    #save_total_limit=2,
)
from transformers import Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=_common_voice_train,
    eval_dataset=_common_voice_test,
    tokenizer=processor.feature_extractor,
)
torch.cuda.empty_cache()
train_result = trainer.train()
metrics = train_result.metrics
max_train_samples = len(_common_voice_train)
metrics["train_samples"] = min(max_train_samples, len(_common_voice_train))
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
I'd be really thankful to anyone who can solve my problem. Please help me, as this is driving me mad.
P.S.: There are commented-out lines that try to set the format of the datasets to torch tensors, but even when I run them I still encounter the error mentioned above.

Extract all the elements within a class in python

I have made the following in order to extract the first element in a class:
if var_source == "Image":
    outcsvfile = 'Image_Ids' + file + '_' + timestamp + '.csv'
    with open(outcsvfile, 'w', encoding='utf-8', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['ax', 'physical_id'])
    for i in range(len(var_ax)):
        browser.get('https://test.com' + str(mpid) + '&ax=' + var_ax[i])
        self.master.update()
        self.status.config(text=str(i+1) + "/" + str(len(var_ax)) + " Extracting AX: " + var_ax[i])
        try:
            ph_id = browser.find_element_by_xpath("//div[contains(@class, 'a-image-wrapper')]").get_attribute("alt")
            print(i+1, ': extract AX:', var_ax[i])
            with open(outcsvfile, 'a+', encoding='utf-8', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([var_ax[i], ph_id])
        except:
            print(i+1, ': extract AX:', var_ax[i])
            with open(outcsvfile, 'a+', encoding='utf-8', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([var_ax[i], '[missing AX]'])
I have two questions:
1. How can I extract all the physical_ids into the same cell, separated by commas (e.g. cell B2 = "physical_id1, physical_id2, physical_id3")?
2. How can I put the number of physical_ids exported into column C (e.g. C2 would be 3, because B2 contains 3 physical_ids)?
The source code:
<div alt="51d5gBEzhjL" style="width:220px;float:left;margin-left:34px;margin-bottom:10px;border:1px solid #D0D0D0" class="a-image-wrapper a-lazy-loaded MAIN GLOBAL 51d5gBEzhjL"><h1 class="a-size-medium a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold">MAIN</h1><h1 class="a-size-base a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold"> ou GLOBAL / Merch 1</h1></div>
<h1 class="a-size-medium a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold">FACT</h1>
<h1 class="a-size-base a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold"> ou GLOBAL / Merch 1</h1>
<span class="a-declarative" data-action="a-modal"><center><img class="ecx" id="51S+wTs36zL" src="https://test.com/images/I/51S+wTs36zL._AA200_.jpg" alt="51S+wTs36zL"></center></span>
<center>
<img class="ecx" id="51S+wTs36zL" src="https://test.com/images/I/51S+wTs36zL._AA200_.jpg" alt="51S+wTs36zL">
</center>
</span>
<h5 class="physical-id">51S+wTs36zL</h5>
<h1 class="a-size-medium a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold" style="background:#D0D0D0">UPLOADED</h1>
<h1 class="a-size-base a-spacing-mini a-spacing-top-mini a-color-information a-text-center a-text-bold">19/Apr/2016:17:45:40</h1>
</div>
This worked for me and resolved both my questions:
if var_source == "Image":
    outcsvfile = 'Image_Ids-' + file + '_' + timestamp + '.csv'
    with open(outcsvfile, 'w', encoding='utf-8', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['ax', 'physical_id', 'image_count'])
    for i in range(len(var_ax)):
        browser.get('https://test.com' + str(mpid) + '&ax=' + var_ax[i])
        self.master.update()
        self.status.config(text=str(i+1) + "/" + str(len(var_ax)) + " Extracting AX: " + var_ax[i])
        try:
            ph_id = browser.find_element_by_xpath("//div[contains(@class, 'a-image-wrapper')]").get_attribute("alt")
            ids1 = browser.find_elements_by_class_name("physical-id")
            ids1Text = []
            for a in ids1:
                ids1Text.append(a.text)
            nr = str(len(ids1))
            ax = ', '.join(ids1Text)
            print(i+1, ': extract AX:', var_ax[i])
            with open(outcsvfile, 'a+', encoding='utf-8', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([var_ax[i], ax, nr])
        except:
            print(i+1, ': extract AX:', var_ax[i])
            with open(outcsvfile, 'a+', encoding='utf-8', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([var_ax[i], '[missing AX]'])
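A possible refinement of the same approach (a sketch, not part of the answer above): open the output file once and reuse a single writer instead of reopening the CSV in append mode on every iteration. It assumes var_source, var_ax, browser, mpid, file and timestamp are defined exactly as in the original code.
import csv

if var_source == "Image":
    outcsvfile = 'Image_Ids-' + file + '_' + timestamp + '.csv'
    with open(outcsvfile, 'w', encoding='utf-8', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['ax', 'physical_id', 'image_count'])
        for i, ax_value in enumerate(var_ax, start=1):
            browser.get('https://test.com' + str(mpid) + '&ax=' + ax_value)
            try:
                # Collect the text of every element with class "physical-id",
                # join them into one cell and record how many there were.
                ids = [el.text for el in browser.find_elements_by_class_name("physical-id")]
                csv_writer.writerow([ax_value, ', '.join(ids), len(ids)])
            except Exception:
                csv_writer.writerow([ax_value, '[missing AX]', 0])
            print(i, ': extract AX:', ax_value)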

Python3 urllib.error.URLError: <urlopen error [Errno 110] Connection timed out> on server

I have a script which builds a new .xml file from an .xml file at some URL. When I run the script on my PC, everything is okay, but when I try to run it on my server, it throws urllib.error.URLError: <urlopen error [Errno 110] Connection timed out>.
I don't know why, and I couldn't find any good source describing this problem. I want this script to run on my server automatically as a cron job, but it does not work.
Code:
#!/usr/bin/env python3
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET

url = 'http://eshop.cobi.cz/modules/xmlfeeds/xml_files/feed_5.xml'
uh = urllib.request.urlopen(url, timeout=30)
tree = ET.parse(uh)
root = tree.getroot()
data = ET.Element('SHOP')
for r in root.findall('SHOPITEM'):
    name = r.find('PRODUCT').text
    try:
        description = r.find('DESCRIPTION').text
    except AttributeError:
        description = ""
        continue
    price = r.find('WHOLESALEPRICE').text
    new_price = (float(price) * 0.8)
    final_price = round(new_price, 2)
    final_price = str(final_price)
    ean = r.find('EAN').text
    sku = r.find('PRODUCTNO').text
    for i in r.findall('IMAGES'):
        img = i.find('IMGURL').text
    status = r.find('ACTIVE').text
    if status == '1':
        status = '1'
    else:
        status = '0'
        continue
    element2 = ET.SubElement(data, 'SHOPITEM')
    s_elem2_1 = ET.SubElement(element2, 'PRODUCTNAME')
    s_elem2_2 = ET.SubElement(element2, 'PRODUCT')
    s_elem2_3 = ET.SubElement(element2, 'DESCRIPTION')
    s_elem2_4 = ET.SubElement(element2, 'PRICE_VAT')
    s_elem2_5 = ET.SubElement(element2, 'EAN')
    s_elem2_6 = ET.SubElement(element2, 'PRODUCTNO')
    s_elem2_7 = ET.SubElement(element2, 'IMGURL')
    s_elem2_8 = ET.SubElement(element2, 'DELIVERY_DATE')
    s_elem2_9 = ET.SubElement(element2, 'CATEGORYTEXT')
    s_elem2_10 = ET.SubElement(element2, 'MANUFACTURER')
    s_elem2_11 = ET.SubElement(element2, 'ACTIVE')
    s_elem2_1.text = "Cobi " + name
    s_elem2_2.text = "Cobi " + name
    s_elem2_3.text = description
    s_elem2_4.text = final_price
    s_elem2_5.text = ean
    s_elem2_6.text = sku
    s_elem2_7.text = img
    s_elem2_8.text = "7"
    s_elem2_9.text = "Heureka.cz | Dětské zboží | Hračky | Stavebnice | Stavebnice Cobi"
    s_elem2_10.text = "Cobi"
    s_elem2_11.text = status
xml_content = ET.tostring(data)
with open('cobifeed.xml', 'wb') as f:
    f.write(xml_content)
Try ping eshop.cobi.cz from cmd/bash, and wget http://eshop.cobi.cz/modules/xmlfeeds/xml_files/feed_5.xml.
You probably don't have network access from your server to eshop.cobi.cz.
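If the feed does turn out to be reachable from the server, a sketch like the following (an illustration, not part of the original script) makes intermittent timeouts easier to diagnose from a cron log by retrying a few times and printing the underlying reason:
#!/usr/bin/env python3
# Sketch: retry the download and report the underlying error so a failed cron
# run leaves a useful trace. Same feed URL as above.
import time
import urllib.error
import urllib.request

url = 'http://eshop.cobi.cz/modules/xmlfeeds/xml_files/feed_5.xml'

def fetch(url, attempts=3, timeout=30):
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            return urllib.request.urlopen(url, timeout=timeout)
        except urllib.error.URLError as exc:
            last_error = exc
            print("attempt", attempt, "failed:", exc.reason)
            time.sleep(5)
    raise last_error

uh = fetch(url)  # then continue with ET.parse(uh) as in the original script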

Problem with super() using Multiple Inheritance with 2 Parent Classes and 1 Base Class

The error I'm having is:
print("Cód. Avião:", self.codigo_aviao)
AttributeError: 'Comprar_Bilhete' object has no attribute 'codigo_aviao'
I tried several options with super(), but had no luck.
How can the class Comprar_Bilhete get the attribute codigo_aviao?
I tried passing it to super() as a parameter, but that gave me an error too.
class Pessoa():
    def __init__(self, nome, apelido, idade, cc, nacionalidade):
        self.nome = nome
        self.apelido = apelido
        self.idade = idade
        self.cartaocidadao = cc
        self.nacionalidade = nacionalidade

    def visualizar_pessoa(self):
        print("O", self.nome, "\b", self.apelido, "tem", self.idade, "anos, possui Cartão do Cidadão com o Nº", self.cartaocidadao, "e tem nacionalidade", self.nacionalidade, " \b.")

class Voo():
    def __init__(self, companhia, cod_voo, cod_aviao, data_partida, horario_partida, data_chegada, horario_chegada, aeroporto_partida, terminal_aeroporto_partida,
                 aeroporto_chegada, terminal_aeroporto_chegada, tipo_de_bagagem):  # Constructor.
        self.companhia_aerea = companhia
        self.codigo_aviao = cod_aviao
        self.codigo_voo = cod_voo
        self.data_voo_partida = data_partida
        self.horario_partida = horario_partida
        self.data_voo_chegada = data_chegada
        self.horario_chegada = horario_chegada
        self.aeroporto_partida = aeroporto_partida
        self.terminal_aeroporto_partida = terminal_aeroporto_partida
        self.aeroporto_chegada = aeroporto_chegada
        self.terminal_aeroporto_chegada = terminal_aeroporto_chegada
        self.tipo_de_bagagem = tipo_de_bagagem

    def visualizar_dados_aviao(self):
        print("Companhia:", self.companhia_aerea, "\nCód. Avião:", self.codigo_aviao)

class Comprar_Bilhete(Pessoa, Voo):
    def __init__(self, nome, apelido, idade, cc, nacionalidade, companhia, cod_voo, cod_aviao, data_partida, horario_partida, data_chegada, horario_chegada,
                 aeroporto_partida, terminal_aeroporto_partida, aeroporto_chegada, terminal_aeroporto_chegada, tipo_de_bagagem, preco):
        super().__init__(nome, apelido, idade, cc, nacionalidade)  # super() was used to keep the code simpler with respect to the attributes of the Pessoa class.
        self.preco_bilhete = preco

    def visualizar_custo_bilhete(self):
        print("O preço do bilhete de avião são", self.preco_bilhete, "euros.")

    def visualizar_pessoa(self):
        print("O", self.nome, "\b", self.apelido, "tem", self.idade, "anos.")

    def visualizar_dados_aviao(self):
        print("Cód. Avião:", self.codigo_aviao)

cliente1 = Comprar_Bilhete("Pedro", "Figueiredo", 49, 9876543, "Portuguesa", "Easyjet", "EJ1011", "FT4537", "27-08-2020", "23:05", "28-08-2020", "01:45",
                           "Humberto Delgado - Lisboa - PT", "Terminal 1", "Stansted - Hertfordshire - UK", "Terminal 3", "Bagagem de Porão + Mala de Mão", 275.48)
cliente1.visualizar_custo_bilhete()
print()
cliente1.visualizar_pessoa()
print()
cliente1.visualizar_dados_aviao()
print()
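For reference, here is one way to make codigo_aviao available on Comprar_Bilhete, sketched as an illustration: with the MRO Comprar_Bilhete -> Pessoa -> Voo, the single super().__init__(...) call only runs Pessoa.__init__, so Voo.__init__ never runs and codigo_aviao is never set. Calling each parent initializer explicitly avoids that.
class Comprar_Bilhete(Pessoa, Voo):
    def __init__(self, nome, apelido, idade, cc, nacionalidade, companhia, cod_voo, cod_aviao,
                 data_partida, horario_partida, data_chegada, horario_chegada, aeroporto_partida,
                 terminal_aeroporto_partida, aeroporto_chegada, terminal_aeroporto_chegada,
                 tipo_de_bagagem, preco):
        # Initialize both parents explicitly so the attributes defined in Voo
        # (including codigo_aviao) exist on the instance as well.
        Pessoa.__init__(self, nome, apelido, idade, cc, nacionalidade)
        Voo.__init__(self, companhia, cod_voo, cod_aviao, data_partida, horario_partida,
                     data_chegada, horario_chegada, aeroporto_partida, terminal_aeroporto_partida,
                     aeroporto_chegada, terminal_aeroporto_chegada, tipo_de_bagagem)
        self.preco_bilhete = preco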

cgi:error pid 1217 client {IP}:55101 malformed header from script 'master3.py': Bad header: Database aangemaakt en succesv

I'm currently working on a Python CGI script, and I'm testing everything on a CentOS 7 Linux virtual machine. When I go to the web page I get a 500 Internal Server Error. My code looks like this:
Master3.py (192.168.234.2) (192.168.234.2/cgi-bin/master3.py)
#!/usr/bin/python3
### Import the modules so they can be used later in the script ###
import cgi, cgitb
import matplotlib.pyplot as plt
import io
import sqlite3
import base64
# create a FieldStorage instance
cgitb.enable()
form = cgi.FieldStorage()
# get the data from the fields sent by the agent
machine = form.getvalue("machine")
cpu = form.getvalue("cpu")
ram = form.getvalue("ram")
total_mem = form.getvalue("total")
used_mem = form.getvalue("used")
free_mem = form.getvalue("free")
#print ("Content-type:text/html\n\n")
def create_table():
    try:
        con = sqlite3.connect('toets_school.db')
        cursor = con.cursor()
        print("Database aangemaakt en succesvol verbonden met SQLite")
        query_create_table = """CREATE TABLE IF NOT EXISTS data (
            machine VARCHAR UNIQUE,
            cpu REAL,
            total_mem REAL,
            used_mem REAL,
            free_mem REAL); """
        cursor.execute(query_create_table)
        print("SQLite table aangemaakt")
        cursor.close()
    except sqlite3.Error as error:
        print("Error while creating a sqlite table", error)
    finally:
        if (con):
            con.close()
            print("sqlite connection is closed")
def insert_data(machine, cpu, total_mem, used_mem, free_mem):
    try:
        con = sqlite3.connect('toets_school.db')
        cursor = con.cursor()
        print("Database aangemaakt en succesvol verbonden met SQLite")
        insert_query = """INSERT INTO data (machine, cpu, total_mem, used_mem, free_mem)
                          VALUES (?, ?, ?, ?, ?);"""
        verkregen_data = (machine, cpu, total_mem, used_mem, free_mem)
        cursor.execute(insert_query, verkregen_data)
        con.commit()
        print("Total", cursor.rowcount, "Data is succesvol in database toets_school.db geschreven")
        con.commit()
        cursor.close()
    except sqlite3.Error as error:
        print("Failed to insert data into table", error)
    finally:
        if (con):
            con.close()
            print("SQLite connection is closed")
def read_data():
    try:
        con = sqlite3.connect('toets_school.db')
        cursor = con.cursor()
        print("Database aangemaakt en succesvol verbonden met SQLite")
        read_query = """SELECT * from data"""
        cursor.execute(read_query)
        records = cursor.fetchall()
        print("Totaal aantal rijen zijn: ", len(records))
        print("print elke rij")
        data = []
        for rij in records:
            data.append(rij[0])
            data.append(rij[1])
            data.append(rij[2])
            data.append(rij[3])
            data.append(rij[4])
            # print (data)
            print("Machine: ", rij[0])
            print("CPU: ", rij[1])
            print("Totaal geheugen: ", rij[2])
            print("Gebruikt geheugen: ", rij[3])
            print("Vrije geheugen: ", rij[4])
            print("\n")
        return data
        cursor.close()
    except sqlite3.Error as error:
        print("Failed to read data from table", error)
    finally:
        if (con):
            con.close()
            print("SQLite connection is closed")
# prepare the data for use in the charts
cpulist = []
ramlist = []
used_memlist = []
free_memlist = []
def chart_cpu():
    plt.plot(cpulist)
    plt.title("CPU gebruik")
    plt.xlabel('minuten')
    plt.ylabel('percentage(%)')
    buffer1 = io.BytesIO()
    plt.grid(True)
    plt.savefig(buffer1, format="png")
    plt.show()
    return buffer1

def chart_ram():
    plt.plot(ramlist)
    plt.title("Geheugen")
    plt.xlabel('minuten')
    plt.ylabel('percentage (%)')
    buffer2 = io.BytesIO()
    plt.grid(True)
    plt.savefig(buffer2, format="png")
    plt.show()
    return buffer2

def chart_used_mem():
    plt.plot(used_memlist)
    plt.title("Gebruikt geheugen")
    plt.xlabel('Geheugen')
    plt.ylabel('Tijd')
    buffer3 = io.BytesIO()
    plt.grid(True)
    plt.savefig(buffer3, format="png")
    plt.show()
    return buffer3

def chart_free_mem():
    plt.plot(free_memlist)
    plt.title("Geheugen")
    plt.xlabel('Geheugen')
    plt.ylabel('Tijd')
    buffer4 = io.BytesIO()
    plt.grid(True)
    plt.savefig(buffer4, format="png")
    plt.show()
    return buffer4
create_table()
insert_data (machine, cpu, total_mem, used_mem, free_mem)
read_data()
chart_cpu()
chart_ram()
chart_used_mem()
chart_free_mem()
var1 = chart_cpu()
var2 = chart_ram()
var3 = chart_used_mem()
var4 = chart_free_mem()  # assuming the free-memory chart was intended for the fourth image
print ("Content-type:text/html\n\n")
print ("<center>")
print ('De naam van deze grafieken is van :', machine)
print ('De totale geheugen van deze machine is: ', total_mem,'GB')
print ("<br>")
print ("<html><head><title>Website HAF sportschool</title></head><body>")
print("<img src='data:image/png;base64,"+str(base64.b64encode(var1.getvalue()).decode('ascii'))+"' />")
print("<img src='data:image/png;base64,"+str(base64.b64encode(var2.getvalue()).decode('ascii'))+"' />")
print("<img src='data:image/png;base64,"+str(base64.b64encode(var3.getvalue()).decode('ascii'))+"' />")
print("<img src='data:image/png;base64,"+str(base64.b64encode(var4.getvalue()).decode('ascii'))+"' />")
print ("</body></html>")
Does anyone know why I'm getting the error: [cgi:error] [pid 1217] [client 192.168.234.1:55101] malformed header from script 'master3.py': Bad header: Database aangemaakt en succesv
httpd conf looks like this:
AllowOverride All
Options +ExecCGI
AddHandler cgi-script .py .cgi
Require all granted
</Directory>
CGI scripts are responsible for printing out the HTTP response headers, then a terminating newline, then the body. When you commented out that print statement, you removed both the only header being sent and the extra newline.
Restore that line, change the actual content type if you want, and switch from \n to \r\n.
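A minimal sketch of how the top of the script could send that header (assuming the rest of master3.py stays as it is):
#!/usr/bin/python3
import sys

# Emit the response header before any other output: one header line ending in
# CRLF, then a blank line (CRLF) to terminate the header block. Everything
# printed after this point belongs to the body.
sys.stdout.write("Content-Type: text/html\r\n\r\n")

print("<html><head><title>master3</title></head><body>")
# ... database work and chart output from the original script ...
print("</body></html>")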
