Finding emails by subject name with Python 3 - python-3.x

I'm trying to pull emails in my inbox with a certain subject name by using the following code:
import imapclient
import pprint
imapObj = imapclient.IMAPClient('imap-mail.outlook.com',ssl=True)
imapObj.login('personalemail#outlook.com','strongpassword')
imapObj.select_folder('INBOX',readonly=True)
imapObj.search('SUBJECT Broker Dealer Fails Report – NY')
The error I'm getting is:
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-35-d172a6d61d89> in <module>
1 imapObj.select_folder('INBOX',readonly=True)
----> 2 imapObj.search('SUBJECT Broker Dealer Fails Report – NY')
C:\ProgramData\Anaconda3\lib\site-packages\imapclient\imapclient.py in search(self, criteria, charset)
954
955 """
--> 956 return self._search(criteria, charset)
957
958 #require_capability('X-GM-EXT-1')
C:\ProgramData\Anaconda3\lib\site-packages\imapclient\imapclient.py in _search(self, criteria, charset)
977 if charset:
978 args.extend([b'CHARSET', to_bytes(charset)])
--> 979 args.extend(_normalise_search_criteria(criteria, charset))
980
981 try:
C:\ProgramData\Anaconda3\lib\site-packages\imapclient\imapclient.py in _normalise_search_criteria(criteria, charset)
1614
1615 if isinstance(criteria, (text_type, binary_type)):
-> 1616 return [to_bytes(criteria, charset)]
1617
1618 out = []
C:\ProgramData\Anaconda3\lib\site-packages\imapclient\util.py in to_bytes(s, charset)
28 def to_bytes(s, charset='ascii'):
29 if isinstance(s, text_type):
---> 30 return s.encode(charset)
31 return s
32
UnicodeEncodeError: 'ascii' codec can't encode character '\u2013' in position 35: ordinal not in range(128)
I have tried different combinations of using lists in the parenthesis ([]), using '' and "" for the name of the subject.

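The subject should start with "Subject:"; try that and see if it works. Note, however, that the traceback shows the failure happens while the search criteria are being encoded as ASCII: the en dash (–, U+2013) in the subject cannot be encoded, so you may also need to pass a charset to search(), e.g. imapObj.search(['SUBJECT', 'Broker Dealer Fails Report – NY'], charset='UTF-8').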

Related

Scraping multiple wikitables using Python

I am a complete beginner in Python. I have a task to scrape an information table from a Wikipedia page. I would like to scrape it using the code below:
from pandas.io.html import read_html
page = requests.get('https://de.wikipedia.org/wiki/Köln')
wikitables = read_html(page, attrs={"class":"hintergrundfarbe5 float-right toptextcells infobox"})
print("Extracted {num} wikitables".format(num=len(wikitables)))
wikitables[0]
But I get the error below, which I think is due to the special character in the URL (the "ö" in Köln). Please help me find out what to change in the program so that I can scrape the information.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-168-d9bd1e1d7548> in <module>
2 page = requests.get('https://de.wikipedia.org/wiki/Köln')
3 Soup = BeautifulSoup(page.content)
----> 4 wikitables = read_html(page, attrs={"class":"hintergrundfarbe5 float-right toptextcells infobox"})
5 print("Extracted {num} wikitables".format(num=len(wikitables)))
6
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, tupleize_cols, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1092 decimal=decimal, converters=converters, na_values=na_values,
1093 keep_default_na=keep_default_na,
-> 1094 displayed_only=displayed_only)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
914 break
915 else:
--> 916 raise_with_traceback(retained)
917
918 ret = []
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\compat\__init__.py in raise_with_traceback(exc, traceback)
418 if traceback == Ellipsis:
419 _, _, traceback = sys.exc_info()
--> 420 raise exc.with_traceback(traceback)
421 else:
422 # this version of raise is a syntax error in Python 3
TypeError: Cannot read object of type 'Response'
This has nothing to do with beautiful Köln...
You need to change
wikitables = read_html(page, attrs={"..."})
to
wikitables = read_html(page.text, attrs={"..."})
and it should work.

Need help passing date to pandas query

How do I pass the output of this prompt to a pandas search by date in excel?
import pandas as pd
TestedDateBegin = pd.to_datetime(input('Input date in mm-dd-yyyy format: '))
For example, if I input 2019-09-08 into above input prompt and run TestedDateBegin I get this output:
Timestamp('2019-09-08 00:00:00')
This search with the date hard coded works fine.
data = df.loc[df['emr_first_access_date'] >= '2019-09-08', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
But how do I pass the date inputted from the prompt so the user can search by any date?
This doesn't work:
data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id',
and throws an exception:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/np_datetime.pyx in pandas._libs.tslibs.np_datetime._string_to_dts()
ValueError: Error parsing datetime string "TestedDateBegin" at position 0
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing.parse_datetime_string()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(timestr, parserinfo, **kwargs)
1357 else:
-> 1358 return DEFAULTPARSER.parse(timestr, **kwargs)
1359
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dateutil\parser\_parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
648 if res is None:
--> 649 raise ValueError("Unknown string format:", timestr)
650
ValueError: ('Unknown string format:', 'TestedDateBegin')
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
144 try:
--> 145 other = _to_M8(other, tz=self.tz)
146 except ValueError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in _to_M8(key, tz)
77 # this also converts strings
---> 78 key = Timestamp(key)
79 if key.tzinfo is not None and tz is not None:
pandas/_libs/tslibs/timestamps.pyx in pandas._libs.tslibs.timestamps.Timestamp.__new__()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_to_tsobject()
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.convert_str_to_tsobject()
ValueError: could not convert string to Timestamp
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-2-702fd23c14bb> in <module>
----> 1 data = df.loc[df['emr_first_access_date'] >= 'TestedDateBegin', ['site_name','subs_num','emr_id', 'emr_first_access_date']]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1714
1715 res_values = dispatch_to_index_op(op, self, other,
-> 1716 pd.DatetimeIndex)
1717
1718 return self._constructor(res_values, index=self.index,
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in dispatch_to_index_op(op, left, right, index_class)
1189 left_idx = left_idx._shallow_copy(freq=None)
1190 try:
-> 1191 result = op(left_idx, right)
1192 except NullFrequencyError:
1193 # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\datetimelike.py in wrapper(self, other)
115 other = other._values
116
--> 117 result = op(self._data, maybe_unwrap_index(other))
118 return result
119
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py in wrapper(self, other)
146 except ValueError:
147 # string that cannot be parsed to Timestamp
--> 148 return ops.invalid_comparison(self, other, op)
149
150 result = op(self.asi8, other.view('i8'))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in invalid_comparison(left, right, op)
1056 else:
1057 raise TypeError("Invalid comparison between dtype={dtype} and {typ}"
-> 1058 .format(dtype=left.dtype, typ=type(right).__name__))
1059 return res_values
1060
TypeError: Invalid comparison between dtype=datetime64[ns] and str
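The error
TypeError: Invalid comparison between dtype=datetime64[ns] and str
tells you that you are trying to compare a datetime column with a string. In your query, 'TestedDateBegin' is written in quotes, so pandas receives the literal text "TestedDateBegin" rather than the value of the variable; pass the variable itself, without quotes. If you need to convert a string to a datetime manually, you can do it like this: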
from datetime import datetime
date = '2019-09-08'
date = datetime.strptime(date, '%Y-%m-%d')
To learn more about date formatting, see the documentation.

Tweet-Cleaning by removing b' and ASCII - can't find the problem

I am currently preprocessing tweets, extracted via Twitter API and saved as csv. Within the csv there are some characters like "b'" at the beginning of the tweet and code like aren\xe2\x80\x99t, which stands for "'". Now I want to remove these chars but don't know how although I have tried it a couple of times. Can anyone help me? I read the file with pandas and Python3. The column is called "text"
What I mean is the following:
b'RT #username: some text some text C\xe2\x80\xa6' OR
"b'RT #username: some text some text .A\xe2\x80\xa6'
Input 1:
df = pd.read_csv('Data/test.csv', encoding= 'utf8')
df['text'] = df['text'].str.replace('b[\s]+', ' ')
df['text'] = df['text'].str.replace('[^\x00-\x7F]+',' ')
df['text'] = df['text'].str.replace('[^\u0000-\uD7FF\uE000-\uFFFF]',' ')
Output 1: Nothing happens.
With the next snippet I tried to apply the UTF-8 encoding. If I am right, this sometimes needs to be done for further processing.
Input 2:
df = pd.read_csv('Data/Result_w8_Pfizer_en_test.csv', encoding= 'utf8')
df.apply(lambda x: pd.lib.infer_dtype(x.values))
Output 2:
AttributeError Traceback (most recent call last)
<ipython-input-50-4c6bdb11d736> in <module>
25
26 df = pd.read_csv('Data/test.csv', encoding= 'utf8') # dtype=string
---> 27 df.apply(lambda x: pd.lib.infer_dtype(x.values))
28
29
~/conda/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
<ipython-input-50-4c6bdb11d736> in <lambda>(x)
25
26 df = pd.read_csv('Data/test.csv', encoding= 'utf8')
---> 27 df.apply(lambda x: pd.lib.infer_dtype(x.values))
28
29
AttributeError: ("module 'pandas' has no attribute 'lib'", 'occurred at index date')
Here I did some research but couldn't find out the issue or how to solve it.

how to remove nameerror Traceback (most recent call last)

I am building a DataFrame from three Series, each created from a dictionary. None of the dictionaries contains a key or value called "name" or "null", but I still get NameError: name 'null' is not defined.
I have already tried rerunning the code in a different Jupyter notebook.
import pandas as pd
p1=pd.Series({'team':'england','keyplayer':'joe root','bowler':'jofra'})
p2=pd.Series({'team':'india','keyplayer':'virat kohli','bowler':'bumhra'})
p3=pd.Series({'team':'australia','keyplayer':'steve smith','bowler':'starc'})
df=pd.DataFrame([p1,p2,p3],index=['1','2','3'])
df.head()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
~\Anaconda3\lib\site-packages\IPython\lib\pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
~\Anaconda3\lib\site-packages\pandas\core\base.py in __repr__(self)
80 Yields Bytestring in Py2, Unicode String in py3.
81 """
---> 82 return str(self)
83
84
~\Anaconda3\lib\site-packages\pandas\core\base.py in __str__(self)
59
60 if compat.PY3:
---> 61 return self.__unicode__()
62 return self.__bytes__()
63
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __unicode__(self)
661 width = None
662 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 663 line_width=width, show_dimensions=show_dimensions)
664
665 return buf.getvalue()
NameError: name 'null' is not defined
---------------- After this error, the table is still displayed.
I tried running your code in an isolated environment, and it worked!
IPython:
In [1]: import pandas as pd
In [2]: p1=pd.Series({'team':'england','keyplayer':'joe root','bowler':'jofra'})
: p2=pd.Series({'team':'india','keyplayer':'virat kohli','bowler':'bumhra'})
: p3=pd.Series({'team':'australia','keyplayer':'steve smith','bowler':'starc'})
: df=pd.DataFrame([p1,p2,p3],index=['1','2','3'])
In [3]: df.head()
Out[3]:
team keyplayer bowler
1 england joe root jofra
2 india virat kohli bumhra
3 australia steve smith starc
I installed only the minimum required to run it: numpy==1.16.4, pandas==0.24.2 and jupyter==1.0.0.
Maybe there is a problem with your installed libraries.
I recommend that you try running your code in a virtualenv with only the required libraries installed.
To learn more, nothing better than the Python documentation itself: https://docs.python.org/3/tutorial/venv.html

AttributeError: Can only use .dt accessor with datetimelike values in 0yrs 0mon format

I am trying to convert a date-like string column to numeric values, but I get an error.
My column looks like this:
train['AVERAGE_ACCT_AGE'].head(6)
0 0yrs 0mon
1 1yrs 11mon
2 0yrs 0mon
3 0yrs 8mon
4 0yrs 0mon
5 1yrs 9mon
Name: AVERAGE_ACCT_AGE, dtype: object
I tried this code to add DateTime format to that variable.
train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y%m')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
376 try:
--> 377 values, tz = conversion.datetime_to_datetime64(arg)
378 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()
TypeError: Unrecognized value type: <class 'str'>
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-49-13f5c298f460> in <module>()
----> 1 train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y-%m')
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin, cache)
449 else:
450 from pandas import Series
--> 451 values = _convert_listlike(arg._values, True, format)
452 result = Series(values, index=arg.index, name=arg.name)
453 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
378 return DatetimeIndex._simple_new(values, name=name, tz=tz)
379 except (ValueError, TypeError):
--> 380 raise e
381
382 if arg is None:
~\Anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
366 dayfirst=dayfirst,
367 yearfirst=yearfirst,
--> 368 require_iso8601=require_iso8601
369 )
370
pandas\_libs\tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas\_libs\tslib.pyx in pandas._libs.tslib.array_to_datetime()
ValueError: time data 0yrs 0mon doesn't match format specified
After that, I tried this code to added error ignore to the column.
train['AVERAGE_ACCT_AGE']=pd.to_datetime(train['AVERAGE.ACCT.AGE'], format='%Y%m',errors='ignore',infer_datetime_format=True)
That ran without an error, so I then tried this code:
train['yrs']=train['AVERAGE_ACCT_AGE'].dt.year
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-50-39b8c6e07f77> in <module>()
----> 1 train['yrs']=train['AVERAGE_ACCT_AGE'].dt.year
~\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
4366 if (name in self._internal_names_set or name in self._metadata or
4367 name in self._accessors):
-> 4368 return object.__getattribute__(self, name)
4369 else:
4370 if self._info_axis._can_hold_identifiers_and_holds_name(name):
~\Anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
130 # we're accessing the attribute of the class, i.e., Dataset.geo
131 return self._accessor
--> 132 accessor_obj = self._accessor(obj)
133 # Replace the property with the accessor object. Inspired by:
134 # http://www.pydanny.com/cached-property.html
~\Anaconda3\lib\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data)
323 pass # we raise an attribute error anyway
324
--> 325 raise AttributeError("Can only use .dt accessor with datetimelike "
326 "values")
Please help me convert this object-type column to a numeric type. I want the years and the months in separate columns.
AttributeError: Can only use .dt accessor with datetimelike values
The column is not of Datetime format.
Here is a quick way to get it to numeric.
I am using more lines than needed.
# Convert an "<N>yrs <M>mon" text column into two numeric columns.
# Expects an existing DataFrame `train` with an 'AVERAGE_ACCT_AGE' column.

# Cast to str first so the .str accessor and strip() work on every row.
train['AVERAGE_ACCT_AGE'] = train['AVERAGE_ACCT_AGE'].astype(str)
# Remove leading/trailing whitespace.
train['AVERAGE_ACCT_AGE'] = train['AVERAGE_ACCT_AGE'].map(lambda x: x.strip())
# Split on the first space and expand into two new columns.
train[['yrs','months']] = train['AVERAGE_ACCT_AGE'].str.split(' ',n=1,expand=True)
# Strip the unit suffixes from the new columns
# (assumes the suffixes are always 'yrs' and 'mon').
train['yrs'] = train['yrs'].str.replace('yrs','')
train['months'] = train['months'].str.replace('mon','')
# Convert yrs to float
train['yrs'] = train['yrs'].astype('float')
# Convert months to float. This must read from 'months'; reading from
# 'yrs' here would overwrite the month values with the year values.
train['months'] = train['months'].astype('float')
Hope it helps.

Resources