Scraping multiple wikitables using Python - python-3.x

I am a complete beginner with Python. I have a task to scrape an information table from a Wikipedia page. I would like to scrape it using the code below:
import requests
from pandas.io.html import read_html

page = requests.get('https://de.wikipedia.org/wiki/Köln')
wikitables = read_html(page, attrs={"class": "hintergrundfarbe5 float-right toptextcells infobox"})
print("Extracted {num} wikitables".format(num=len(wikitables)))
wikitables[0]
But I get the error below, which I assumed was due to the special character in the URL (Köln). Please help me find where to modify the program so it scrapes the information.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-168-d9bd1e1d7548> in <module>
2 page = requests.get('https://de.wikipedia.org/wiki/Köln')
3 Soup = BeautifulSoup(page.content)
----> 4 wikitables = read_html(page, attrs={"class":"hintergrundfarbe5 float-right toptextcells infobox"})
5 print("Extracted {num} wikitables".format(num=len(wikitables)))
6
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1092 decimal=decimal, converters=converters, na_values=na_values,
1093 keep_default_na=keep_default_na,
-> 1094 displayed_only=displayed_only)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
914 break
915 else:
--> 916 raise_with_traceback(retained)
917
918 ret = []
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
418 if traceback == Ellipsis:
419 _, _, traceback = sys.exc_info()
--> 420 raise exc.with_traceback(traceback)
421 else:
422 # this version of raise is a syntax error in Python 3
TypeError: Cannot read object of type 'Response'

This has nothing to do with beautiful Köln...
You need to change
wikitables = read_html(page, attrs={"..."})
to
wikitables = read_html(page.text, attrs={"..."})
and it should work.
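For reference, a minimal working version of the snippet with that fix applied (a sketch, assuming requests and an HTML parser such as lxml are installed):
import requests
from pandas.io.html import read_html

page = requests.get('https://de.wikipedia.org/wiki/Köln')
# read_html accepts a URL, a file-like object, or a raw HTML string -- not a
# requests Response object -- so pass the decoded body, page.text
wikitables = read_html(page.text, attrs={"class": "hintergrundfarbe5 float-right toptextcells infobox"})
print("Extracted {num} wikitables".format(num=len(wikitables)))
print(wikitables[0])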

df.query("'string'") produces ValueError: NumExpr 2 does not support Unicode as a dtype

If anyone has a solution for how I can get this to work, please let me know. I would prefer not to downgrade Python to 2.x.
I have tried to remap some of the columns to different dtypes. I think Python 3.x may store strings as Unicode, and perhaps pandas and/or numexpr do not support this on the versions I am using.
pandas 1.1.5
numexpr 2.8.1
numpy 1.19.5
python 3.6.9
import pandas as pd

data = [['tom', 10], ['nick', 15], ['juli', 14]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['Name'] = df['Name'].astype('string')
df.dtypes
df.query("'tom'")
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-37-a5f548d874ef> in <module>()
7 df['Name'] = df['Name'].astype('string')
8 df.dtypes
----> 9 df.query("'tom'")
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in query(self, expr, inplace, **kwargs)
3343 kwargs["level"] = kwargs.pop("level", 0) + 1
3344 kwargs["target"] = None
-> 3345 res = self.eval(expr, **kwargs)
3346
3347 try:
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in eval(self, expr, inplace, **kwargs)
3473 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
3474
-> 3475 return _eval(expr, inplace=inplace, **kwargs)
3476
3477 def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
/usr/local/lib/python3.6/dist-packages/pandas/core/computation/eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
344 eng = _engines[engine]
345 eng_inst = eng(parsed_expr)
--> 346 ret = eng_inst.evaluate()
347
348 if parsed_expr.assigner is None:
/usr/local/lib/python3.6/dist-packages/pandas/core/computation/engines.py in evaluate(self)
71
72 # make sure no names in resolvers and locals/globals clash
---> 73 res = self._evaluate()
74 return reconstruct_object(
75 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
/usr/local/lib/python3.6/dist-packages/pandas/core/computation/engines.py in _evaluate(self)
112 scope = env.full_scope
113 _check_ne_builtin_clash(self.expr)
--> 114 return ne.evaluate(s, local_dict=scope)
115
116
~/.local/lib/python3.6/site-packages/numexpr/necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
813 # Create a signature
814 signature = [(name, getType(arg)) for (name, arg) in
--> 815 zip(names, arguments)]
816
817 # Look up numexpr if possible.
~/.local/lib/python3.6/site-packages/numexpr/necompiler.py in <listcomp>(.0)
812
813 # Create a signature
--> 814 signature = [(name, getType(arg)) for (name, arg) in
815 zip(names, arguments)]
816
~/.local/lib/python3.6/site-packages/numexpr/necompiler.py in getType(a)
689 return bytes
690 if kind == 'U':
--> 691 raise ValueError('NumExpr 2 does not support Unicode as a dtype.')
692 raise ValueError("unknown type %s" % a.dtype.name)
693
ValueError: NumExpr 2 does not support Unicode as a dtype.
The only reason you get a confusing error message that mentions dtypes at all is that you're using the NumExpr engine.
With the python engine, the resulting KeyError is clearer:
>>> df.query("'tom'", engine='python')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/frame.py", line 3348, in query
result = self.loc[res]
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/indexing.py", line 879, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/indexing.py", line 1110, in _getitem_axis
return self._get_label(key, axis=axis)
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/indexing.py", line 1059, in _get_label
return self.obj.xs(label, axis=axis)
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/generic.py", line 3493, in xs
loc = self.index.get_loc(key)
File "/home/bert2me/miniconda3/envs/deleteme/lib/python3.6/site-packages/pandas/core/indexes/range.py", line 358, in get_loc
raise KeyError(key)
KeyError: 'tom'
As wjandrea pointed out, this isn't a valid query expression to begin with. Did you mean:
>>> df.query("Name == 'tom'")
  Name  Age
0  tom   10
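If you need to keep the 'string' dtype and sidestep NumExpr entirely, here's a minimal sketch reusing the frame from the question (engine='python' is a standard query parameter):
import pandas as pd

data = [['tom', 10], ['nick', 15], ['juli', 14]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df['Name'] = df['Name'].astype('string')

# The pure-Python engine never hands the Unicode column to NumExpr
print(df.query("Name == 'tom'", engine='python'))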

Can not find the pytorch model when loading BERT model in Python

I am following this article to compute text similarity.
The code I have is this:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
documents = [
"Vodafone Wins ₹ 20,000 Crore Tax Arbitration Case Against Government",
"Voda Idea shares jump nearly 15% as Vodafone wins retro tax case in Hague",
"Gold prices today fall for 4th time in 5 days, down ₹6500 from last month high",
"Silver futures slip 0.36% to Rs 59,415 per kg, down over 12% this week",
"Amazon unveils drone that films inside your home. What could go wrong?",
"IPHONE 12 MINI PERFORMANCE MAY DISAPPOINT DUE TO THE APPLE B14 CHIP",
"Delhi Capitals vs Chennai Super Kings: Prithvi Shaw shines as DC beat CSK to post second consecutive win in IPL",
"French Open 2020: Rafael Nadal handed tough draw in bid for record-equaling 20th Grand Slam"
]
model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
I get an error when running the above code:
Full:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\tarfile.py in nti(s)
188 s = nts(s, "ascii", "strict")
--> 189 n = int(s.strip() or "0", 8)
190 except ValueError:
ValueError: invalid literal for int() with base 8: 'ld_tenso'
During handling of the above exception, another exception occurred:
InvalidHeaderError Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\tarfile.py in next(self)
2298 try:
-> 2299 tarinfo = self.tarinfo.fromtarfile(self)
2300 except EOFHeaderError as e:
~\anaconda3\envs\py3_nlp\lib\tarfile.py in fromtarfile(cls, tarfile)
1092 buf = tarfile.fileobj.read(BLOCKSIZE)
-> 1093 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1094 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
~\anaconda3\envs\py3_nlp\lib\tarfile.py in frombuf(cls, buf, encoding, errors)
1034
-> 1035 chksum = nti(buf[148:156])
1036 if chksum not in calc_chksums(buf):
~\anaconda3\envs\py3_nlp\lib\tarfile.py in nti(s)
190 except ValueError:
--> 191 raise InvalidHeaderError("invalid header")
192 return n
InvalidHeaderError: invalid header
During handling of the above exception, another exception occurred:
ReadError Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
594 try:
--> 595 return legacy_load(f)
596 except tarfile.TarError:
~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in legacy_load(f)
505
--> 506 with closing(tarfile.open(fileobj=f, mode='r:', format=tarfile.PAX_FORMAT)) as tar, \
507 mkdtemp() as tmpdir:
~\anaconda3\envs\py3_nlp\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
~\anaconda3\envs\py3_nlp\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
~\anaconda3\envs\py3_nlp\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
~\anaconda3\envs\py3_nlp\lib\tarfile.py in next(self)
2310 elif self.offset == 0:
-> 2311 raise ReadError(str(e))
2312 except EmptyHeaderError:
ReadError: invalid header
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
1210 try:
-> 1211 state_dict = torch.load(resolved_archive_file, map_location="cpu")
1212 except Exception:
~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
425 pickle_load_args['encoding'] = 'utf-8'
--> 426 return _load(f, map_location, pickle_module, **pickle_load_args)
427 finally:
~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
598 # .zip is used for torch.jit.save and will throw an un-pickling error here
--> 599 raise RuntimeError("{} is a zip archive (did you mean to use torch.jit.load()?)".format(f.name))
600 # if not a tarfile, reset file offset and proceed
RuntimeError: C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin is a zip archive (did you mean to use torch.jit.load()?)
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-3-bba56aac60aa> in <module>
----> 1 model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\SentenceTransformer.py in __init__(self, model_name_or_path, modules, device, cache_folder)
88
89 if os.path.exists(os.path.join(model_path, 'modules.json')): #Load as SentenceTransformer model
---> 90 modules = self._load_sbert_model(model_path)
91 else: #Load with AutoModel
92 modules = self._load_auto_model(model_path)
~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\SentenceTransformer.py in _load_sbert_model(self, model_path)
820 for module_config in modules_config:
821 module_class = import_from_string(module_config['type'])
--> 822 module = module_class.load(os.path.join(model_path, module_config['path']))
823 modules[module_config['name']] = module
824
~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\models\Transformer.py in load(input_path)
122 with open(sbert_config_path) as fIn:
123 config = json.load(fIn)
--> 124 return Transformer(model_name_or_path=input_path, **config)
125
126
~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\models\Transformer.py in __init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path)
27
28 config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
---> 29 self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
30 self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
31
~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\models\auto\auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
393 if type(config) in cls._model_mapping.keys():
394 model_class = _get_model_class(config, cls._model_mapping)
--> 395 return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
396 raise ValueError(
397 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
1212 except Exception:
1213 raise OSError(
-> 1214 f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
1215 f"at '{resolved_archive_file}'"
1216 "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
OSError: Unable to load weights from pytorch checkpoint file for 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\' at 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin'If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.
Short:
OSError: Unable to load weights from pytorch checkpoint file for 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens' at 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin'If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.
I do have the pytorch_model.bin in the '.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens' folder.
Why am I getting this error?
The reason for the error seems to be that the pre-trained model weights are not available or cannot be loaded.
You can try this to load the pretrained model weights:
from transformers import AutoModel
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
Reference: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
Also, the model's Hugging Face page says:
This model is deprecated. Please don't use it as it produces sentence embeddings of low quality. You can find recommended sentence embedding models here: SBERT.net - Pretrained Models
You might want to take a look.
You may need to use the model without sentence_transformers.
The following code is tweaked from https://www.sbert.net/examples/applications/computing-embeddings/README.html
As I understand it from the exception, you need to pass from_tf=True to AutoModel.from_pretrained:
from transformers import AutoTokenizer, AutoModel
import torch

# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

# Load AutoModel from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens', from_tf=True)

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling (in this case, mean pooling)
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
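Since the goal in the question is text similarity, a short follow-up sketch using the embeddings computed above (cosine_similarity comes from scikit-learn, as in the question's own imports):
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the pooled sentence embeddings
similarities = cosine_similarity(sentence_embeddings.numpy())
print(similarities)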

Cytoscape: How do you import ABC file types with py2cytoscape's cyrest api?

I have a file of the type:
A B 0.123
A C 0.84
B D 0.52
...
where the data are tab-separated, the first and second columns are the nodes, and the third is the associated edge weight.
When trying to import this file into Cytoscape using py2cytoscape, I'm receiving an error:
from py2cytoscape import cyrest
fileName="/Users/96v/Documents/lco/lcoAllAt25/lcoAll25/lcoAll25_top0.041pct_data/lcoAll25_top0.041pct.txt"
cyclient = cyrest.cyclient()
cyclient.network.import_file(dataTypeList='string,string,double',
afile=fileName,
delimiters='\t',
indexColumnSourceInteraction="0",
indexColumnTargetInteraction="1",
verbose=True)
'http://localhost:1234/v1/commands/network/import file'
TypeError Traceback (most recent call last)
in
----> 1 cyclient.network.import_file(dataTypeList='string,string,double', afile=fileName, delimiters='\t', indexColumnSourceInteraction="0", indexColumnTargetInteraction="1", defaultInteraction="Edge Attribute",verbose=True)
2
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/network.py in import_file(self, dataTypeList, defaultInteraction, delimiters, delimitersForDataList, afile, firstRowAsColumnNames, indexColumnSourceInteraction, indexColumnTargetInteraction, indexColumnTypeInteraction, NetworkViewRendererList, RootNetworkList, startLoadRow, TargetColumnList, verbose)
464 afile,firstRowAsColumnNames,indexColumnSourceInteraction,indexColumnTargetInteraction,
465 indexColumnTypeInteraction,NetworkViewRendererList,RootNetworkList,startLoadRow,TargetColumnList])
--> 466 response=api(url=self.__url+"/import file", PARAMS=PARAMS, method="POST", verbose=verbose)
467 return response
468
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/base.py in api(namespace, command, PARAMS, body, host, port, version, method, verbose, url, parse_params)
139 sys.stdout.flush()
140 r = requests.post(url = baseurl, json = PARAMS)
--> 141 verbose_=checkresponse(r, verbose=verbose)
142 if (verbose) or (verbose_):
143 verbose=True
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/base.py in checkresponse(r, verbose)
43 if 200 <= status < 300:
44 if verbose:
---> 45 print("response status "+status)
46 sys.stdout.flush()
47 res=None
TypeError: can only concatenate str (not "int") to str
The edge weights aren't being recognized, and the documentation for this function isn't very detailed.
Any help would be extremely appreciated!
After looking further at the GUI, I realized:
Columns are not 0-indexed; they start at 1.
verbose=True hits a separate bug in py2cytoscape's response handling (the TypeError at the bottom of the traceback).
The below code works fine:
from py2cytoscape import cyrest
fileName="pathToFile"
cyclient = cyrest.cyclient()
collection = cyclient.network.import_file(dataTypeList='string,string,double',
afile=fileName,
delimiters='\t',
indexColumnSourceInteraction="1",
indexColumnTargetInteraction="2",
defaultInteraction="interacts with")
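For what it's worth, the verbose failure is a small bug visible in the traceback: py2cytoscape's checkresponse concatenates an integer HTTP status to a string. If you want to patch your local copy, a one-line sketch of the fix:
print("response status " + str(status))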

How do you translate the rows of French and Arabic text in the columns into English?

I want to translate a dataframe column that's in French and Arabic:
0 Chef de projet
...
6 professeur
7 Chef de projet
8 مدير شركة
I tried:
from googletrans import Translator
translator = Translator()
df['new_professionactuelle']= df['new_professionactuelle'].apply(translator.translate)
But obtained
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-211-90b46ab0043a> in <module>
1 from googletrans import Translator
2 translator = Translator()
----> 3 df['new_professionactuelle']= df['new_professionactuelle'].apply(translator.translate)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
3589 else:
3590 values = self.astype(object).values
-> 3591 mapped = lib.map_infer(values, f, convert=convert_dtype)
3592
3593 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
C:\ProgramData\Anaconda3\lib\site-packages\googletrans\client.py in translate(self, text, dest, src)
170
171 origin = text
--> 172 data = self._translate(text, dest, src)
173
174 # this code will be updated when the format is changed.
C:\ProgramData\Anaconda3\lib\site-packages\googletrans\client.py in _translate(self, text, dest, src)
73 text = text.decode('utf-8')
74
---> 75 token = self.token_acquirer.do(text)
76 params = utils.build_params(query=text, src=src, dest=dest,
77 token=token)
C:\ProgramData\Anaconda3\lib\site-packages\googletrans\gtoken.py in do(self, text)
199 def do(self, text):
200 self._update()
--> 201 tk = self.acquire(text)
202 return tk
C:\ProgramData\Anaconda3\lib\site-packages\googletrans\gtoken.py in acquire(self, text)
144 a = []
145 # Convert text to ints
--> 146 for i in text:
147 val = ord(i)
148 if val < 0x10000:
TypeError: 'NoneType' object is not iterable
I tried to get the rows that might be NoneType:
df['new_professionactuelle'][type(df['new_professionactuelle']) == "NoneType"]
But got:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-215-f2597906f267> in <module>
----> 1 df['new_professionactuelle'][type(df['new_professionactuelle']) == "NoneType"]
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
866 key = com.apply_if_callable(key, self)
867 try:
--> 868 result = self.index.get_value(self, key)
869
870 if not is_scalar(result):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
4373 try:
4374 return self._engine.get_value(s, k,
-> 4375 tz=getattr(series.dtype, 'tz', None))
4376 except KeyError as e1:
4377 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: False
You can try this code to translate all of the text into English:
from googletrans import Translator

translator = Translator()

def toenglish(x):
    print(x)
    result = translator.translate(x, dest='en')
    return result.text

df['new_professionactuelle'] = list(map(toenglish, df['new_professionactuelle']))
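Note that the original TypeError ('NoneType' object is not iterable) means some values are None by the time they reach translator.translate. A hedged sketch that skips missing values first, using the same column:
mask = df['new_professionactuelle'].notna()
df.loc[mask, 'new_professionactuelle'] = df.loc[mask, 'new_professionactuelle'].apply(toenglish)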

Need help passing date to pandas query

How do I pass the output of this prompt to a pandas date search on data read from Excel?
import pandas as pd
TestedDateBegin = pd.to_datetime(input('Input date in mm-dd-yyyy format: '))
For example, if I input 2019-09-08 at the above prompt and then evaluate TestedDateBegin, I get this output:
Timestamp('2019-09-08 00:00:00')
This search, with the date hard-coded, works fine:
data = df.loc[df['emr_first_access_date'] >= '2019-09-08', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
But how do I pass the date entered at the prompt so the user can search by any date?
This doesn't work:
data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
and throws an exception:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/np_datetime.pyx in pandas._libs.tslibs.np_datetime._string_to_dts()
ValueError: Error parsing datetime string "TestedDateBegin" at position 0
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing.parse_datetime_string()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(timestr, parserinfo, **kwargs)
1357 else:
-> 1358 return DEFAULTPARSER.parse(timestr, **kwargs)
1359
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
648 if res is None:
--> 649 raise ValueError("Unknown string format:", timestr)
650
ValueError: ('Unknown string format:', 'TestedDateBegin')
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
144 try:
--> 145 other = _to_M8(other, tz=self.tz)
146 except ValueError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in _to_M8(key, tz)
77 # this also converts strings
---> 78 key = Timestamp(key)
79 if key.tzinfo is not None and tz is not None:
pandas/_libs/tslibs/timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
ValueError: could not convert string to Timestamp
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-2-702fd23c14bb> in <module>
----> 1 data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1714
1715 res_values = dispatch_to_index_op(op, self, other,
-> 1716 pd.DatetimeIndex)
1717
1718 return self._constructor(res_values, index=self.index,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in dispatch_to_index_op(op, left, right, index_class)
1189 left_idx = left_idx._shallow_copy(freq=None)
1190 try:
-> 1191 result = op(left_idx, right)
1192 except NullFrequencyError:
1193 # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\datetimelike.py in wrapper(self, other)
115 other = other._values
116
--> 117 result = op(self._data, maybe_unwrap_index(other))
118 return result
119
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
146 except ValueError:
147 # string that cannot be parsed to Timestamp
--> 148 return ops.invalid_comparison(self, other, op)
149
150 result = op(self.asi8, other.view('i8'))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in invalid_comparison(left, right, op)
1056 else:
1057 raise TypeError("Invalid comparison between dtype={dtype} and {typ}"
-> 1058 .format(dtype=left.dtype, typ=type(right).__name__))
1059 return res_values
1060
TypeError: Invalid comparison between dtype=datetime64[ns] and str
The error
TypeError: Invalid comparison between dtype=datetime64[ns] and str
says that you are trying to compare a datetime column with a string. Because you wrote 'TestedDateBegin' in quotes, pandas receives the literal string "TestedDateBegin" rather than your variable; pass the variable itself, unquoted. If you are starting from a plain string, convert it to a datetime first:
from datetime import datetime
date = '2019-09-08'
date = datetime.strptime(date, '%Y-%m-%d')
To learn more about date formatting, see the documentation.
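Putting it together, a minimal sketch built from the question's own snippet (df and the column names are as in the question):
import pandas as pd

TestedDateBegin = pd.to_datetime(input('Input date in mm-dd-yyyy format: '))

# No quotes around TestedDateBegin: compare against the Timestamp variable itself
data = df.loc[df['emr_first_access_date'] >= TestedDateBegin,
              ['site_name', 'subs_num', 'emr_id', 'emr_first_access_date']]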
