Gensim fasttext wrapper returns permission error 13 while training a model - file-permissions

I tried to reproduce this tutorial on my local machine to get familiar with gensim's fastText functionality. The fastText and gensim libraries are correctly installed. When I call the train method of the gensim fastText wrapper
model_wrapper = FT_wrapper.train(ft_home, lee_train_file)
I get the following error:
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
<ipython-input-19-0815ab031d23> in <module>()
3
4 # train the model
----> 5 model_wrapper = FT_wrapper.train(ft_home, lee_train_file)
6
7 print(model_wrapper)
~/anaconda3/lib/python3.6/site-packages/gensim/models/deprecated/fasttext_wrapper.py in train(cls, ft_path, corpus_file, output_file, model, size, alpha, window, min_count, word_ngrams, loss, sample, negative, iter, min_n, max_n, sorted_vocab, threads)
240 cmd.append(str(value))
241
--> 242 utils.check_output(args=cmd)
243 model = cls.load_fasttext_format(output_file)
244 cls.delete_training_files(output_file)
~/anaconda3/lib/python3.6/site-packages/gensim/utils.py in check_output(stdout, *popenargs, **kwargs)
1795 try:
1796 logger.debug("COMMAND: %s %s", popenargs, kwargs)
-> 1797 process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
1798 output, unused_err = process.communicate()
1799 retcode = process.poll()
~/anaconda3/lib/python3.6/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
707 c2pread, c2pwrite,
708 errread, errwrite,
--> 709 restore_signals, start_new_session)
710 except:
711 # Cleanup if the child failed starting.
~/anaconda3/lib/python3.6/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1342 if errno_num == errno.ENOENT:
1343 err_msg += ': ' + repr(err_filename)
-> 1344 raise child_exception_type(errno_num, err_msg, err_filename)
1345 raise child_exception_type(err_msg)
1346
PermissionError: [Errno 13] Permission denied: '/Users/marcomattioli/fastText'
Note that I have -rwxr-xr-x permissions on the fastText executable. Any help on how to fix this would be appreciated.

That wrapper method is deprecated. Use the following code instead:
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
corpus_file = datapath('lee_background.cor') # absolute path to corpus
model3 = FastText(size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)
total_words = model3.corpus_total_words
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=10)
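Once training finishes, the model can be queried directly. A minimal follow-up sketch (as an aside, gensim 4.x renamed the size parameter used above to vector_size):
# FastText composes word vectors from character n-grams, so it can return
# vectors even for words that were not seen during training.
vector = model3.wv['night']                 # 4-dimensional, since size=4 above
print(vector.shape)
print(model3.wv.most_similar('night', topn=5))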

Related

(linux) jupyter notebook accesses a file: permission denied

I'm running AlphaFold.
My Jupyter notebook tries to access a file and shows 'permission denied':
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
/tmp/ipykernel_2436/4236937636.py in <cell line: 129>()
136 # Don't do redundant work for multiple copies of the same chain in the multimer.
137 if sequence not in raw_msa_results_for_sequence:
--> 138 raw_msa_results = get_msa(fasta_path=fasta_path)
139 raw_msa_results_for_sequence[sequence] = raw_msa_results
140 else:
/tmp/ipykernel_2436/4236937636.py in get_msa(fasta_path)
120 z_value=db_config['z_value'])
121 # Group the results by database name.
--> 122 raw_msa_results[db_name].extend(jackhmmer_runner.query(fasta_path))
123
124 return raw_msa_results
~/Alphafold/alphafold/alphafold/data/tools/jackhmmer.py in query(self, input_fasta_path, max_sequences)
198 # Run Jackhmmer with the chunk
199 future.result()
--> 200 chunked_output.append(self._query_chunk(
201 input_fasta_path, db_local_chunk(i), max_sequences))
202
~/Alphafold/alphafold/alphafold/data/tools/jackhmmer.py in _query_chunk(self, input_fasta_path, database_path, max_sequences)
132
133 logging.info('Launching subprocess "%s"', ' '.join(cmd))
--> 134 process = subprocess.Popen(
135 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
136 with utils.timing(
~/.conda/envs/default/lib/python3.9/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask)
949 encoding=encoding, errors=errors)
950
--> 951 self._execute_child(args, executable, preexec_fn, close_fds,
952 pass_fds, cwd, env,
953 startupinfo, creationflags, shell,
~/.conda/envs/default/lib/python3.9/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)
1819 if errno_num != 0:
1820 err_msg = os.strerror(errno_num)
-> 1821 raise child_exception_type(errno_num, err_msg, err_filename)
1822 raise child_exception_type(err_msg)
1823
PermissionError: [Errno 13] Permission denied: './hmmer-3.3.1/src/'
I have tried 'chmod 777', but it did not help.
My chmod 777 attempt:
os.system('chmod 777 -R /home/studio-lab-user/hmmer-3.3.1/src/*')
JACKHMMER_BINARY_PATH = './hmmer-3.3.1/src/'
Anyone have an idea?
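One thing worth checking (an assumption based on the traceback, not a confirmed fix): the failing path './hmmer-3.3.1/src/' is a directory, and trying to execute a directory raises exactly this Errno 13, regardless of chmod. JACKHMMER_BINARY_PATH probably needs to point at the jackhmmer binary inside that directory, roughly like this:
import os

# Hypothetical correction: point at the executable itself, not its parent directory
JACKHMMER_BINARY_PATH = './hmmer-3.3.1/src/jackhmmer'
print(os.path.isfile(JACKHMMER_BINARY_PATH))          # should be True
print(os.access(JACKHMMER_BINARY_PATH, os.X_OK))      # should be True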

In python3 Tabula.read_pdf returns TypeError: expected str, bytes or os.PathLike object, not builtin_function_or_method. How do I make it work?

I am running my scraping project in Jupyter Notebooks on my server using Python 3. For some reason tabula-py errors when running tabula.read_pdf and returns TypeError: expected str, bytes or os.PathLike object, not builtin_function_or_method. How do I make it work? I am passing an actual PDF file.
My code that errors
import tabula
df = tabula.read_pdf("20200125-sitrep-5-2019-ncov.pdf", pages=all)
My error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-20-4f86b7402956> in <module>
----> 1 df = tabula.read_pdf("20200125-sitrep-5-2019-ncov.pdf", pages=all)
/usr/local/lib/python3.7/dist-packages/tabula/io.py in read_pdf(input_path, output_format, encoding, java_options, pandas_options, multiple_tables, user_agent, **kwargs)
320
321 try:
--> 322 output = _run(java_options, kwargs, path, encoding)
323 finally:
324 if temporary:
/usr/local/lib/python3.7/dist-packages/tabula/io.py in _run(java_options, options, path, encoding)
83 stderr=subprocess.PIPE,
84 stdin=subprocess.DEVNULL,
---> 85 check=True,
86 )
87 if result.stderr:
/usr/lib/python3.7/subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
470 kwargs['stderr'] = PIPE
471
--> 472 with Popen(*popenargs, **kwargs) as process:
473 try:
474 stdout, stderr = process.communicate(input, timeout=timeout)
/usr/lib/python3.7/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
773 c2pread, c2pwrite,
774 errread, errwrite,
--> 775 restore_signals, start_new_session)
776 except:
777 # Cleanup if the child failed starting.
/usr/lib/python3.7/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1451 errread, errwrite,
1452 errpipe_read, errpipe_write,
-> 1453 restore_signals, start_new_session, preexec_fn)
1454 self._child_created = True
1455 finally:
TypeError: expected str, bytes or os.PathLike object, not builtin_function_or_method
My PDF is named 20200125-sitrep-5-2019-ncov.pdf. This is the pdf that I scraped - https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200125-sitrep-5-2019-ncov.pdf?sfvrsn=429b143d_8
Tabula does not seem to work on the server or in virtual environments, so I decided to use another library called Camelot.
Installed Camelot
pip install camelot-py
Import Camelot
import camelot
My new code
tables = camelot.read_pdf('20200125-sitrep-5-2019-ncov.pdf', pages='3', process_background=True)
tables.export('20200125-sitrep-5-2019-ncov.csv', f='csv', compress=True)
tables[0]
tables[0].parsing_report
{
    'accuracy': 99.02,
    'whitespace': 12.24,
    'order': 1,
    'page': 1
}
tables[0].to_csv('foo.csv') # to_json, to_excel, to_html
df_1 = tables[0].df # get a pandas DataFrame!
Documentation can be found here: https://camelot-py.readthedocs.io/en/master/user/quickstart.html
Further reading: https://camelot-py.readthedocs.io/en/master/user/advanced.html#advanced
Your
pages=all
should be
pages = "all"
tabula.read_pdf expects the pages argument to be a string such as "all" (or an int or a list of ints), not the Python built-in function all. That mismatch is why you are seeing
expected str, bytes or os.PathLike object, not builtin_function_or_method
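Put together, the corrected call from the question looks like this (depending on the tabula-py version, the result may be a single DataFrame or a list of DataFrames):
import tabula

# pages must be a string such as "all" (or an int / list of ints),
# not the Python built-in function all
df = tabula.read_pdf("20200125-sitrep-5-2019-ncov.pdf", pages="all")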

FileNotFoundError: [WinError 2] The system cannot find the file specified: exiftool

I am trying to extract the metadata from an mp4/jpg file. I am using exiftool, but if there is something better out there, please say so. I would like to begin with a video, extract the frames as jpgs, and add the metadata to each frame; the metadata for the images should differ slightly, for example in time and maybe focal length.
Here is the start of my attempt with https://smarnach.github.io/pyexiftool/. I don't think it's even loading as et, but I am new at this and do not know what the problem could be.
Here is the MWE (which is pretty much what is in the documentation); it does the same whether I use .jpg or .mp4:
import exiftool

files = ['file.MP4', 'file.MP4']
with exiftool.ExifTool() as et:
    metadata = et.get_metadata_batch(files)
for d in metadata:
    print("{:20.20} {:20.20}".format(d["SourceFile"],
                                     d["EXIF:DateTimeOriginal"]))
and the error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-4-2bf611f4ab6b> in <module>
9 files = ['file.MP4', 'file1.MP4']
10
---> 11 with exiftool.ExifTool() as et:
12 metadata = et.get_metadata_batch(files)
13 for d in metadata:
C:\ProgramData\Anaconda3\lib\site-packages\exiftool.py in __enter__(self)
189
190 def __enter__(self):
--> 191 self.start()
192 return self
193
C:\ProgramData\Anaconda3\lib\site-packages\exiftool.py in start(self)
172 "-common_args", "-G", "-n"],
173 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
--> 174 stderr=devnull)
175 self.running = True
176
C:\ProgramData\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
727 c2pread, c2pwrite,
728 errread, errwrite,
--> 729 restore_signals, start_new_session)
730 except:
731 # Cleanup if the child failed starting.
C:\ProgramData\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1015 env,
1016 os.fspath(cwd) if cwd is not None else None,
-> 1017 startupinfo)
1018 finally:
1019 # Child is launched. Close the parent's copy of those pipe
FileNotFoundError: [WinError 2] The system cannot find the file specified
The way you are using pyexiftool requires that the exiftool executable be available in a directory listed in the PATH environment variable.
Open a cmd window, type the command exiftool, and hit Enter. If that also returns a "file not found" error, then either:
1. exiftool is not installed, or
2. the directory where exiftool is installed is not in the PATH.
In case (2) you can give the full path to the exiftool executable in the constructor. For example:
exiftool.ExifTool(r'C:\program files\exiftool\exiftool.exe')
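Combined with the snippet from the question, that would look roughly like this (the install path is just the example path from above; adjust it to wherever exiftool.exe actually lives):
import exiftool

# Assumed install location -- change to your actual exiftool.exe path
with exiftool.ExifTool(r'C:\program files\exiftool\exiftool.exe') as et:
    metadata = et.get_metadata_batch(['file.MP4'])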

pyspark:FileNotFoundError: [WinError 2] The system cannot find the file specified

The program below causes the following error: pyspark:FileNotFoundError: [WinError 2] The system cannot find the file specified.
from pyspark import SparkContext

sc = SparkContext("local", "Local app")
words = sc.parallelize(
    ["scala",
     "java",
     "hadoop",
     "spark",
     "akka",
     "spark vs hadoop",
     "pyspark",
     "pyspark and spark"]
)
words_filter = words.filter(lambda x: 'spark' in x)
filtered = words_filter.take(4)
print(filtered)
The full stacktrace:
FileNotFoundError Traceback (most recent call last)
<ipython-input-15-6c02343320b8> in <module>()
1 from pyspark import SparkContext
2 #sc = SparkSession.builder.master("local").appName("Word Count").config("spark.some.config.option", "some-value").getOrCreate()
----> 3 sc = SparkContext("local", "")
4
5 words = sc.parallelize (
C:\opt\spark\spark-2.3.0-bin-hadoop2.7\spark-2.3.0-bin-hadoop2.7\python\pyspark\context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
113 """
114 self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
116 try:
117 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
C:\opt\spark\spark-2.3.0-bin-hadoop2.7\spark-2.3.0-bin-hadoop2.7\python\pyspark\context.py in _ensure_initialized(cls, instance, gateway, conf)
278 with SparkContext._lock:
279 if not SparkContext._gateway:
--> 280 SparkContext._gateway = gateway or launch_gateway(conf)
281 SparkContext._jvm = SparkContext._gateway.jvm
282
C:\opt\spark\spark-2.3.0-bin-hadoop2.7\spark-2.3.0-bin-hadoop2.7\python\pyspark\java_gateway.py in launch_gateway(conf)
78 else:
79 # preexec_fn not supported on Windows
---> 80 proc = Popen(command, stdin=PIPE, env=env)
81
82 gateway_port = None
C:\ProgramData\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
707 c2pread, c2pwrite,
708 errread, errwrite,
--> 709 restore_signals, start_new_session)
710 except:
711 # Cleanup if the child failed starting.
C:\ProgramData\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
995 env,
996 os.fspath(cwd) if cwd is not None else None,
--> 997 startupinfo)
998 finally:
999 # Child is launched. Close the parent's copy of those pipe
FileNotFoundError: [WinError 2] The system cannot find the file specified
Not sure about this because I was not able to reproduce your error, but looking at java_gateway.py, it might help to check the environment variable $SPARK_HOME and whether the spark-submit script can be found under $SPARK_HOME.
In Python:
import os
print(os.environ.get("SPARK_HOME"))
print(os.path.join(os.environ.get("SPARK_HOME"), './bin/spark-submit.cmd'))
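To turn that printout into an explicit yes/no check, here is a small extension of the snippet above (same paths, just adding os.path.isfile):
import os

spark_home = os.environ.get("SPARK_HOME")
spark_submit = os.path.join(spark_home, "bin", "spark-submit.cmd") if spark_home else None
print(spark_home)
print(spark_submit, os.path.isfile(spark_submit) if spark_submit else False)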
I don't think your session is created correctly; try this instead:
from pyspark.sql import SparkSession

sc = SparkSession.builder \
    .master('local[*]') \
    .appName('your app name') \
    .getOrCreate()
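Note that with this approach sc is a SparkSession, so RDD operations such as parallelize go through its underlying SparkContext; the original example then becomes, for instance:
# SparkSession wraps a SparkContext; RDD methods live on sc.sparkContext
words = sc.sparkContext.parallelize(
    ["scala", "java", "hadoop", "spark", "akka",
     "spark vs hadoop", "pyspark", "pyspark and spark"])
print(words.filter(lambda x: 'spark' in x).take(4))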

error message when using pydot to save an image: AttributeError: module 'os' has no attribute 'errno'

I am trying to use the following code segment to save a tree image generated from fitting a random forest model
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')
But graph.write_png('tree.png') generates the following error message. What might be the reason?
FileNotFoundError Traceback (most recent call last)
~\Anaconda3\envs\dsproject\lib\site-packages\pydot.py in create(self, prog, format, encoding)
1860 shell=False,
-> 1861 stderr=subprocess.PIPE, stdout=subprocess.PIPE)
1862 except OSError as e:
~\Anaconda3\envs\dsproject\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
755 errread, errwrite,
--> 756 restore_signals, start_new_session)
757 except:
~\Anaconda3\envs\dsproject\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1154 os.fspath(cwd) if cwd is not None else None,
-> 1155 startupinfo)
1156 finally:
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-15-a132c22204b9> in <module>()
----> 1 graph.write_png('tree.png')
~\Anaconda3\envs\dsproject\lib\site-packages\pydot.py in new_method(path, f, prog, encoding)
1671 self.write(
1672 path, format=f, prog=prog,
-> 1673 encoding=encoding)
1674 name = 'write_{fmt}'.format(fmt=frmt)
1675 self.__setattr__(name, new_method)
~\Anaconda3\envs\dsproject\lib\site-packages\pydot.py in write(self, path, prog, format, encoding)
1754 f.write(s)
1755 else:
-> 1756 s = self.create(prog, format, encoding=encoding)
1757 with io.open(path, mode='wb') as f:
1758 f.write(s)
~\Anaconda3\envs\dsproject\lib\site-packages\pydot.py in create(self, prog, format, encoding)
1861 stderr=subprocess.PIPE, stdout=subprocess.PIPE)
1862 except OSError as e:
-> 1863 if e.errno == os.errno.ENOENT:
1864 args = list(e.args)
1865 args[1] = '"{prog}" not found in path.'.format(
AttributeError: module 'os' has no attribute 'errno'
In Python 3.7, os.errno is no longer available. You can modify python3.7/site-packages/pydot.py to import errno directly and change line 1863 to:
import errno
if e.errno == errno.ENOENT:
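Note that this patch only unmasks the original FileNotFoundError at the top of the traceback: pydot is shelling out to the Graphviz dot executable and cannot find it, so Graphviz most likely also needs to be installed and on PATH. A quick standard-library check (an addition to the original answer):
import shutil

# None means the Graphviz "dot" executable is not on PATH, which is what
# makes pydot's subprocess call fail in the first place
print(shutil.which("dot"))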
