Conversion of a CSV file to different form - python-3.x

i have a CSV file with contents
01"815732013.0"1brand1"[100 76 64 ... 153 139 94]"
01"815732025.0"1female1"[183 192 201 ... 18 10 0]"
01"815732027.0"1male1"[204 214 221 ... 214 221 255]"
in one column
I need the contents in four columns like this,
col1 col2 col3 col4
01 "815732013.0" 1brand1 "[100 76 64 ... 153 139 94]"
01 "815732025.0" 1female1 "[183 192 201 ... 18 10 0]"
01 "815732027.0" 1male1 "[204 214 221 ... 214 221 255]"
How can i change this?
using python/excel/any other tools.

If you don't need to have to have the double quotes in the output file, then you should be fine with splitting the lines on the double quotes:
import csv
import io
text = '''01"815732013.0"1brand1"[100 76 64 ... 153 139 94]"
01"815732025.0"1female1"[183 192 201 ... 18 10 0]"
01"815732027.0"1male1"[204 214 221 ... 214 221 255]"'''
with io.StringIO(text) as f, open('output.csv', 'w') as of:
writer = csv.writer(of, delimiter=',', quotechar='"')
for line in f:
line = [r for r in line.strip().split('"') if r]
writer.writerow(line)
This snippet of code is pretty straightforward. You're basically splitting on the double quotes and discarding empty strings.
If you wish your output file to contain the quotes, then you may have to use some regular expression to capture the fields:
import csv
import io
import re
text = '''01"815732013.0"1brand1"[100 76 64 ... 153 139 94]"
01"815732025.0"1female1"[183 192 201 ... 18 10 0]"
01"815732027.0"1male1"[204 214 221 ... 214 221 255]"'''
with io.StringIO(text) as f, open('output.csv', 'w') as of:
pat = re.compile(r'(\d+)(\b".+"\b)(\w+)(\b".+"\b)')
writer = csv.writer(of, delimiter=',', quotechar='"')
for line in f:
line = pat.sub(r'\1;\2;\3;\4', line.strip()).split(';')
writer.writerow(line)
This is very similar to the previous snippet, with the only difference being the regular expression. The expression groups the different fields according to your desired output. Those groups are used to generated a set of row values which are passed to the writer.writerow method to write the row in your destination file.
I hope this proves useful.

Related

Huggingface tokenizer not able to load model after upgrading python to 3.10

I just updated Python to version 3.10.8. Note that I use JupyterLab.
I had to re-install a lot of packages, but now I get an error when I try to load the tokenizer of an HuggingFace model
This is my code:
# Import libraries
from transformers import pipeline, AutoTokenizer
# Define checkpoint
model_checkpoint = 'deepset/xlm-roberta-large-squad2'
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
Note that version of transformers is 4.24.0.
This is the error I get:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [3], line 2
1 # Tokenizer
----> 2 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
File ~/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:637, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
635 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
636 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 637 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
638 else:
639 if tokenizer_class_py is not None:
File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1777, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1774 else:
1775 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1777 return cls._from_pretrained(
1778 resolved_vocab_files,
1779 pretrained_model_name_or_path,
1780 init_configuration,
1781 *init_inputs,
1782 use_auth_token=use_auth_token,
1783 cache_dir=cache_dir,
1784 local_files_only=local_files_only,
1785 _commit_hash=commit_hash,
1786 **kwargs,
1787 )
File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1932, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
1930 # Instantiate tokenizer.
1931 try:
-> 1932 tokenizer = cls(*init_inputs, **init_kwargs)
1933 except OSError:
1934 raise OSError(
1935 "Unable to load vocabulary from file. "
1936 "Please check that the provided vocabulary is accessible and not corrupted."
1937 )
File ~/.local/lib/python3.10/site-packages/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py:155, in XLMRobertaTokenizerFast.__init__(self, vocab_file, tokenizer_file, bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token, **kwargs)
139 def __init__(
140 self,
141 vocab_file=None,
(...)
151 ):
152 # Mask token behave like a normal word, i.e. include the space before it
153 mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
--> 155 super().__init__(
156 vocab_file,
157 tokenizer_file=tokenizer_file,
158 bos_token=bos_token,
159 eos_token=eos_token,
160 sep_token=sep_token,
161 cls_token=cls_token,
162 unk_token=unk_token,
163 pad_token=pad_token,
164 mask_token=mask_token,
165 **kwargs,
166 )
168 self.vocab_file = vocab_file
169 self.can_save_slow_tokenizer = False if not self.vocab_file else True
File ~/.local/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:114, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
111 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
--> 114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
115 elif self.slow_tokenizer_class is not None:
116 # We need to create and convert a slow tokenizer to build the backend
117 slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:1162, in convert_slow_tokenizer(transformer_tokenizer)
1154 raise ValueError(
1155 f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance."
1156 " No converter was found. Currently available slow->fast convertors:"
1157 f" {list(SLOW_TO_FAST_CONVERTERS.keys())}"
1158 )
1160 converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
-> 1162 return converter_class(transformer_tokenizer).converted()
File ~/.local/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.py:438, in SpmConverter.__init__(self, *args)
434 requires_backends(self, "protobuf")
436 super().__init__(*args)
--> 438 from .utils import sentencepiece_model_pb2 as model_pb2
440 m = model_pb2.ModelProto()
441 with open(self.original_tokenizer.vocab_file, "rb") as f:
File ~/.local/lib/python3.10/site-packages/transformers/utils/sentencepiece_model_pb2.py:20
18 from google.protobuf import descriptor as _descriptor
19 from google.protobuf import message as _message
---> 20 from google.protobuf import reflection as _reflection
21 from google.protobuf import symbol_database as _symbol_database
24 # ##protoc_insertion_point(imports)
File /usr/lib/python3/dist-packages/google/protobuf/reflection.py:58
56 from google.protobuf.pyext import cpp_message as message_impl
57 else:
---> 58 from google.protobuf.internal import python_message as message_impl
60 # The type of all Message classes.
61 # Part of the public interface, but normally only used by message factories.
62 GeneratedProtocolMessageType = message_impl.GeneratedProtocolMessageType
File /usr/lib/python3/dist-packages/google/protobuf/internal/python_message.py:69
66 import copyreg as copyreg
68 # We use "as" to avoid name collisions with variables.
---> 69 from google.protobuf.internal import containers
70 from google.protobuf.internal import decoder
71 from google.protobuf.internal import encoder
File /usr/lib/python3/dist-packages/google/protobuf/internal/containers.py:182
177 collections.MutableMapping.register(MutableMapping)
179 else:
180 # In Python 3 we can just use MutableMapping directly, because it defines
181 # __slots__.
--> 182 MutableMapping = collections.MutableMapping
185 class BaseContainer(object):
187 """Base container class."""
AttributeError: module 'collections' has no attribute 'MutableMapping'
I tried several solutions (for example, this and this), but none seem to work.
According to this link, I should change collections.Mapping into collections.abc.Mapping, but I wouldn't knwo where to do it.
Another possible solution is downgrading Python to 3.9, but I would like to keep it as last resort.
How can I fix this?
Turned out it was a problem related to protobuf module. I updated it to the latest version to date (which is 4.21.9).
This changed the error to:
TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
1. Downgrade the protobuf package to 3.20.x or lower.
2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).
More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates
So I downgraded protobuf to version 3.20.0 and that worked.
For further details, look here.

Cytoscape: How do you import ABC file types with py2cytoscape's cyrest api?

I have a file of the type:
A B 0.123
A C 0.84
B D 0.52
...
Where the data are tab separated, and the first and second columns are the nodes, and the third is the associated edge weight.
When trying to import this file into cytoscape using py2cytoscape, I'm receiving an error:
from py2cytoscape import cyrest
fileName="/Users/96v/Documents/lco/lcoAllAt25/lcoAll25/lcoAll25_top0.041pct_data/lcoAll25_top0.041pct.txt"
cyclient = cyrest.cyclient()
cyclient.network.import_file(dataTypeList='string,string,double',
afile=fileName,
delimiters='\t',
indexColumnSourceInteraction="0",
indexColumnTargetInteraction="1",
verbose=True)
'http://localhost:1234/v1/commands/network/import file'
TypeError Traceback (most recent call last)
in
----> 1 cyclient.network.import_file(dataTypeList='string,string,double', afile=fileName, delimiters='\t', indexColumnSourceInteraction="0", indexColumnTargetInteraction="1", defaultInteraction="Edge Attribute",verbose=True)
2
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/network.py in import_file(self, dataTypeList, defaultInteraction, delimiters, delimitersForDataList, afile, firstRowAsColumnNames, indexColumnSourceInteraction, indexColumnTargetInteraction, indexColumnTypeInteraction, NetworkViewRendererList, RootNetworkList, startLoadRow, TargetColumnList, verbose)
464 afile,firstRowAsColumnNames,indexColumnSourceInteraction,indexColumnTargetInteraction,
465 indexColumnTypeInteraction,NetworkViewRendererList,RootNetworkList,startLoadRow,TargetColumnList])
--> 466 response=api(url=self.__url+"/import file", PARAMS=PARAMS, method="POST", verbose=verbose)
467 return response
468
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/base.py in api(namespace, command, PARAMS, body, host, port, version, method, verbose, url, parse_params)
139 sys.stdout.flush()
140 r = requests.post(url = baseurl, json = PARAMS)
--> 141 verbose_=checkresponse(r, verbose=verbose)
142 if (verbose) or (verbose_):
143 verbose=True
~/opt/anaconda3/lib/python3.8/site-packages/py2cytoscape/cyrest/base.py in checkresponse(r, verbose)
43 if 200 <= status < 300:
44 if verbose:
---> 45 print("response status "+status)
46 sys.stdout.flush()
47 res=None
TypeError: can only concatenate str (not "int") to str
The edge weights aren't being recognized, yet the documentation isn't as verbose for this function.
Any help would be extremely appreciated!
After looking further at the GUI, I realized:
Columns are not 0 indexed.
Verbose has an error in it.
The below code works fine:
from py2cytoscape import cyrest
fileName="pathToFile"
cyclient = cyrest.cyclient()
collection = cyclient.network.import_file(dataTypeList='string,string,double',
afile=fileName,
delimiters='\t',
indexColumnSourceInteraction="1",
indexColumnTargetInteraction="2",
defaultInteraction="interacts with")

Tweet-Cleaning by removing b' and ASCII - can't find the problem

I am currently preprocessing tweets, extracted via Twitter API and saved as csv. Within the csv there are some characters like "b'" at the beginning of the tweet and code like aren\xe2\x80\x99t, which stands for "'". Now I want to remove these chars but don't know how although I have tried it a couple of times. Can anyone help me? I read the file with pandas and Python3. The column is called "text"
What I mean is the following:
b'RT #username: some text some text C\xe2\x80\xa6' OR
"b'RT #username: some text some text .A\xe2\x80\xa6'
Input 1:
df = pd.read_csv('Data/test.csv', encoding= 'utf8')
df['text'] = df['text'].str.replace('b[\s]+', ' ')
df['text'] = df['text'].str.replace('[^\x00-\x7F]+',' ')
df['text'] = df['text'].str.replace('[^\u0000-\uD7FF\uE000-\uFFFF]',' ')
Output 1: Nothing happens.
With the next snippet I tried to apply the UTF-8 encoding. As I am write this need sometimes to be done for further processing.
Input 2:
df = pd.read_csv('Data/Result_w8_Pfizer_en_test.csv', encoding= 'utf8')
df.apply(lambda x: pd.lib.infer_dtype(x.values))
Output 2:
AttributeError Traceback (most recent call last)
<ipython-input-50-4c6bdb11d736> in <module>
25
26 df = pd.read_csv('Data/test.csv', encoding= 'utf8') # dtype=string
---> 27 df.apply(lambda x: pd.lib.infer_dtype(x.values))
28
29
~/conda/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
~/conda/lib/python3.6/site-packages/pandas/core/apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
<ipython-input-50-4c6bdb11d736> in <lambda>(x)
25
26 df = pd.read_csv('Data/test.csv', encoding= 'utf8')
---> 27 df.apply(lambda x: pd.lib.infer_dtype(x.values))
28
29
AttributeError: ("module 'pandas' has no attribute 'lib'", 'occurred at index date')
Here I did some research but couldn't find out the issue or how to solve it.

OptionError: "No such keys(s): 'display.height'"

So I have been trying to figure this out but my pandas 'display.height' option is not working and comes up with OptionError: "No such keys(s): 'display.height'". Here is what I have done.
import pandas as pd
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
The code returns the following error:
C:\Anaconda3\lib\site-packages\pandas\core\config.py in __call__(self, *args, **kwds)
225
226 def __call__(self, *args, **kwds):
--> 227 return self.__func__(*args, **kwds)
228
229 #property
C:\Anaconda3\lib\site-packages\pandas\core\config.py in _set_option(*args, **kwargs)
117
118 for k, v in zip(args[::2], args[1::2]):
--> 119 key = _get_single_key(k, silent)
120
121 o = _get_registered_option(key)
C:\Anaconda3\lib\site-packages\pandas\core\config.py in _get_single_key(pat, silent)
81 if not silent:
82 _warn_if_deprecated(pat)
---> 83 raise OptionError('No such keys(s): {pat!r}'.format(pat=pat))
84 if len(keys) > 1:
85 raise OptionError('Pattern matched multiple keys')
OptionError: "No such keys(s): 'display.height'"
Do you have any suggestions how to correct this?
According to pandas 0.15 document, you should use display.max_rows to replace display.height.
display.height : int
Deprecated. [default: 60] [currently: 15] (Deprecated, use display.max_rows instead.)

how to run auto.arima using rpy2

I want to call R's auto.arima function from Python. I think i have not yet fully understood this interface. Can someone help me here - to send a time series obj to R, call forecast related functions and get back the results?
This is what I have done so far:
from rpy2.robjects import r
from rpy2.robjects import pandas2ri
#create a python time series
count = range(1, 51)
df['count'] = count
df['date'] = pd.date_range('2016-01-01', '2016-02-19')
df.set_index('date', inlace = True)
df.sort_index(inplace = True)
pandas2ri.activate()
r_timeseries = pandas2ri.py2ri(df)
r('fit <- auto.arima(r_timeseries)')
I think I have to import some R packages (like forecast). Not sure how to go about doing that in Python, properly pass the python time series object to R etc.
In [63]: r_ts = pandas2ri.py2ri(df)
In [64]: r_ts
Out[64]:
<DataFrame - Python:0x1126a93f8 / R:0x7ff7bfa51bc8>
[IntVector]
X0: <class 'rpy2.robjects.vectors.IntVector'>
<IntVector - Python:0x1126a96c8 / R:0x7ff7be1af1c0>
[ 1, 2, 3, ..., 48, 49, 50]
And, when I attempt to call forecast
In [83]: x = r('forecast(r_ts)')
/Library/Python/2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Error in forecast(r_ts) : object 'r_ts' not found
res = super(Function, self).__call__(*new_args, **new_kwargs)
---------------------------------------------------------------------------
RRuntimeError Traceback (most recent call last)
<ipython-input-83-0765ffc30741> in <module>()
----> 1 x = r('forecast(r_ts)')
/Library/Python/2.7/site-packages/rpy2/robjects/__init__.pyc in __call__(self, string)
319 def __call__(self, string):
320 p = _rparse(text=StrSexpVector((string,)))
--> 321 res = self.eval(p)
322 return conversion.ri2py(res)
323
/Library/Python/2.7/site-packages/rpy2/robjects/functions.pyc in __call__(self, *args, **kwargs)
176 v = kwargs.pop(k)
177 kwargs[r_k] = v
--> 178 return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
179
180 pattern_link = re.compile(r'\\link\{(.+?)\}')
/Library/Python/2.7/site-packages/rpy2/robjects/functions.pyc in __call__(self, *args, **kwargs)
104 for k, v in kwargs.items():
105 new_kwargs[k] = conversion.py2ri(v)
--> 106 res = super(Function, self).__call__(*new_args, **new_kwargs)
107 res = conversion.ri2ro(res)
108 return res
RRuntimeError: Error in forecast(r_ts) : object 'r_ts' not found
I tried the following as well:
In [99]: f = r('forecast.auto.arima(r_ts)')
---------------------------------------------------------------------------
RRuntimeError Traceback (most recent call last)
<ipython-input-99-1c4610d2740d> in <module>()
----> 1 f = r('forecast.auto.arima(r_ts)')
/Library/Python/2.7/site-packages/rpy2/robjects/__init__.pyc in __call__(self, string)
319 def __call__(self, string):
320 p = _rparse(text=StrSexpVector((string,)))
--> 321 res = self.eval(p)
322 return conversion.ri2py(res)
323
/Library/Python/2.7/site-packages/rpy2/robjects/functions.pyc in __call__(self, *args, **kwargs)
176 v = kwargs.pop(k)
177 kwargs[r_k] = v
--> 178 return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
179
180 pattern_link = re.compile(r'\\link\{(.+?)\}')
/Library/Python/2.7/site-packages/rpy2/robjects/functions.pyc in __call__(self, *args, **kwargs)
104 for k, v in kwargs.items():
105 new_kwargs[k] = conversion.py2ri(v)
--> 106 res = super(Function, self).__call__(*new_args, **new_kwargs)
107 res = conversion.ri2ro(res)
108 return res
RRuntimeError: Error in eval(expr, envir, enclos) :
could not find function "forecast.auto.arima"
you could try what I do
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()
ro.r('library(forecast)')
rdf = pandas2ri.py2ri(df)
ro.globalenv['r_timeseries'] = rdf
pred = ro.r('as.data.frame(forecast(auto.arima(r_timeseries),h=5))')
this way, you can handle pred as a data frame like this
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
51 51 51 51 51 51
52 52 52 52 52 52
53 53 53 53 53 53
54 54 54 54 54 54
55 55 55 55 55 55
In the first attempt you are telling R to use a variable r_ts that it does not now much about (the name r_ts is defined in your Python namespace), and in the second attempt you are added to this a function name R does not know anything about. Both error message are precisely reporting this as a problem.
Your first attempt could be rewritten as:
x = r('forecast')(r_ts)

Resources