I started having a strange issue when I try to read a CSV from a GCP bucket and write it back to the same bucket.
Please note that the code below used to work for me, but now an exception is thrown in the Airflow logs:
{models.py:1796} ERROR - Error executing an HTTP request: libcurl code 23 meaning 'Failed writing received data to disk/application', error details: Received 134221820 response bytes for a 134217728-byte buffer
when reading gs://file_bucket/abc.csv
Traceback (most recent call last):
  File "/usr/local/lib/airflow/airflow/models.py", line 1664, in _run_raw_task
    result = task_copy.execute(context=context)
  File "/usr/local/lib/airflow/airflow/operators/python_operator.py", line 103, in execute
    return_value = self.execute_callable()
  File "/usr/local/lib/airflow/airflow/operators/python_operator.py", line 108, in execute_callable
    return self.python_callable(*self.op_args, **self.op_kwargs)
  File "/home/airflow/gcs/dags/handle_split_rows.py", line 56, in handle_split_rows
    lines = file_stream.read()
  File "/opt/python3.6/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py", line 132, in read
    pywrap_tensorflow.ReadFromStream(self._read_buf, length, status)
  File "/opt/python3.6/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error executing an HTTP request: libcurl code 23 meaning 'Failed writing received data to disk/application', error details: Received 134221820 response bytes for a 134217728-byte buffer
when reading gs://file_bucket/abc.csv
code:
#!/usr/bin/env python
import os
import json
from datetime import datetime, timedelta
from airflow import DAG
from airflow.models import Variable
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators import python_operator
from airflow.contrib.hooks import gcs_hook
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.bigquery_operator import BigQueryCreateEmptyTableOperator
from airflow.contrib.operators.gcs_list_operator import GoogleCloudStorageListOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.operators import PythonOperator, BranchPythonOperator
from airflow.operators import BashOperator
from lib import notification_utility

default_args = {
    'owner': os.environ["OWNER"],
    'depends_on_past': False,
    'start_date': '2019-10-10 09:31:00'
}

with DAG('parse_bad_rows',
         default_args=default_args,
         catchup=False,
         schedule_interval=None
         ) as dag:

    def parse_rows(**context):
        import pandas as pd
        import numpy as np
        import csv
        import os
        import gcsfs
        from tensorflow.python.lib.io import file_io
        from pandas.compat import StringIO
        import io
        # **tf.disable_v2_behavior() also tried disabling v1 just in case but i dont think it makes any sense**
        # updated_file_list = context['ti'].xcom_pull(task_ids='list_files_delta_bucket_test')
        fs = gcsfs.GCSFileSystem(project='project_name')
        updated_file_list = fs.ls('/bucket_name/foldername/')
        updated_file_list = [x for x in updated_file_list if "abc" in x]
        print("updated_file_list------------------>", updated_file_list)
        for f in updated_file_list:
            print("File Being processed------->", f)
            file_name = os.path.splitext(f)[0]
            # **this is where the job is failing while reading the file so I am assuming it has to do something with tensorflow.python.lib.io import file_io**
            file_stream = file_io.FileIO("gs://" + f, mode='r')
            lines = file_stream.read()
            file_stream_less_cols = io.StringIO(lines)
            Split_Rows = [x for x in file_stream_less_cols if x.count('|') < 297]
            Split_Rows = ' '.join(map(str, Split_Rows))
            file_stream.close()
            Split_Rows_Stream = pd.DataFrame(io.StringIO(Split_Rows), columns=['BLOB_COLUMN'], dtype='str')
            # Split_Rows_Stream['File_Name'] = Split_Rows_Stream.index
            parse_names = file_name.split('/')
            filename = parse_names[2]
            bucketname = parse_names[0]
            Split_Rows_Stream['FILE_NAME'] = filename
            print("bucketname------------>", bucketname)
            print("filename------------->", filename)
            Split_Rows_Stream.to_csv("gs://" + bucketname + "/ERROR_FILES/" + filename + ".csv", encoding='utf-8', quoting=csv.QUOTE_NONE, escapechar='|')

    Python_Task_Split_Rows = PythonOperator(
        task_id='split_rows_to_process_test',
        provide_context=True,
        python_callable=parse_rows,
        # op_kwargs={'project':'project_name','bucket':'bucket_name','table_name':'abc','delim_num':297},
        # trigger_rule=TriggerRule.ALL_SUCCESS,
        dag=dag
    )

    # Orchestration
    Python_Task_Split_Rows
I also tried the same thing locally to make sure the CSV itself is not the issue.
import pandas as pd
import numpy as np
import csv
import io
import os

# Read the files
directory = 'c:\\Users\\BG/Downloads/file_Cleansing'
for filename in os.listdir(directory):
    file_name = filename.split('.')[0]
    f = open('c:\\Users\\BG/Downloads/file_Cleansing/' + filename, 'r', encoding="utf8")
    # Read lines from the text file
    lines = f.read()
    # Cleanse the lines
    file_stream = io.StringIO(lines)
    Split_Rows = [x for x in file_stream if x.count('|') < 297]
    Split_Rows = ' '.join(map(str, Split_Rows))
    f.close()
    Split_Rows_Stream = pd.DataFrame(io.StringIO(Split_Rows), columns=['blob'])
    Split_Rows_Stream["File_Name"] = file_name
    Split_Rows_Stream.to_csv("c:\\Users\\BG/Downloads/file_Cleansed/" + filename + "_error.csv", escapechar='|', encoding='utf-8')
The above worked as expected.
My goal is to find records that do not have the expected number of delimiters for a row (my delimiter is the pipe, and 297 pipes are expected per row because the CSV has 298 columns, but some rows contain pipes inside the data), capture those records, load them into a CSV and then into a BigQuery table, and finally concatenate the rows back together (using SQL LEAD or LAG, with the file name and index number for ordering and grouping) to repair and recover as many records as possible.
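For reference, here is a minimal sketch of the same row-filtering step done through gcsfs (which is already imported above) instead of TensorFlow's file_io; the project, bucket and folder names are the placeholders from the snippet above, not real ones:
import io
import gcsfs
import pandas as pd

# Sketch only: reuse the gcsfs filesystem object instead of tensorflow file_io.
# 'project_name' and the bucket/folder below are the placeholders used earlier.
fs = gcsfs.GCSFileSystem(project='project_name')

for f in fs.ls('bucket_name/foldername/'):
    if "abc" not in f:
        continue
    # fs.open streams the object directly, avoiding the file_io read path.
    with fs.open(f, 'r') as fh:
        lines = fh.read()
    # Keep only rows that do not have the expected 297 pipe delimiters.
    bad_rows = [x for x in io.StringIO(lines) if x.count('|') < 297]
    df = pd.DataFrame(bad_rows, columns=['BLOB_COLUMN'], dtype='str')
    print(f, "bad rows:", len(df))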
Also, my service account has recently changed; could this be a permission issue on GCP?
Any advice is appreciated.
Thank you for your time.
It seems to be an issue related to [permissions][1]. Verify that your service account is listed in the bucket permissions and that it has a role that allows it to read and/or write.
I replicated your scenario with your code to read the file and it works correctly:
from tensorflow.python.lib.io import file_io
import gcsfs
import os, io

fs = gcsfs.GCSFileSystem(project='project_name')
updated_file_list = fs.ls('bucket')
updated_file_list = [x for x in updated_file_list if "filename" in x]
print("updated_file_list------------------>", updated_file_list)
for f in updated_file_list:
    print("File Being processed------->", f)
    file_name = os.path.splitext(f)[0]
    # **this is where the job is failing while reading the file so I am assuming it has to do something with tensorflow.python.lib.io import file_io**
    file_stream = file_io.FileIO("gs://" + f, mode='r')
    lines = file_stream.read()
    print(lines)
OUTPUT:
('updated_file_list------------------>', [u'bucket/file'])
('File Being processed------->', u'bucket/file')
this is a text from a file
Yes, this could be a permission issue on GCP. Could you also check the GCS logs? You may find more information about this problem there.
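To confirm whether the service account actually has the needed access, a small sketch using the google-cloud-storage client can help (the project and bucket names are placeholders); test_iam_permissions returns the subset of the requested permissions the caller actually holds:
from google.cloud import storage

# Sketch only: check which permissions the active service account has on the bucket.
# 'project_name' and 'bucket_name' are placeholders; credentials come from the
# environment (GOOGLE_APPLICATION_CREDENTIALS or the Composer default account).
client = storage.Client(project='project_name')
bucket = client.bucket('bucket_name')

wanted = ["storage.objects.get", "storage.objects.list", "storage.objects.create"]
granted = bucket.test_iam_permissions(wanted)

print("granted:", granted)
print("missing:", sorted(set(wanted) - set(granted)))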
Related
I want to use the PyPDF2 module to merge PDFs.
The following code works fine:
import PyPDF2
import sys
import os

input_path = r'\Users\XXXXX\OneDrive\Desktop\PDF_File_Input'
merger = PyPDF2.PdfFileMerger()
for file in os.listdir(input_path):
    if file.endswith(".pdf"):
        print(file)
As soon as I add the append call, I get a traceback error from line 10: FileNotFoundError: [Errno 2] No such file or directory: 'abc.pdf'
import PyPDF2
import sys
import os

input_path = r'\Users\XXXXX\OneDrive\Desktop\PDF_File_Input'
merger = PyPDF2.PdfFileMerger()
for file in os.listdir(input_path):
    if file.endswith(".pdf"):
        merger.append(file)
merger.write("combined_file.pdf")
I don't understand why the file can be found via print but not by the append function.
If the path is correct, then it most likely has something to do with the way you implement the merger.
import os
from PyPDF2 import PdfFileMerger, PdfFileReader

def run_mergepdf(args):
    # check_required and check_files are the author's helper utilities (defined elsewhere).
    check_required(args, ["input", "output"])
    print("Number of input files: {0}".format(len(args.input)))

    # Preliminary checks
    print("Checking read/write status")
    check_files(args.input, action="r")
    check_files([args.output], action="w")

    # Join pdfs
    print("Starting to merge PDFs")
    merger = PdfFileMerger(strict=False)
    for pdf in args.input:
        if os.stat(pdf).st_size != 0:
            merger.append(PdfFileReader(pdf))

    print("Writing merged file: {0}".format(args.output))
    merger.write(args.output)
    print("PDFs merged successfully!")
I'm trying to open files from this URL: https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/California/
It's telling me it's not a zip file. I've searched other cases, but the situations are so varied it's hard to determine the course of action.
The following is my code:
import pandas as pd
import os
import numpy as np
from zipfile import ZipFile
from urllib.request import urlopen
zipurl = 'https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/California/'
zipresp = urlopen(zipurl)
tempzip = open("C:/Temp/tempfile.zip", "wb")
tempzip.write(zipresp.read())
tempzip.close()
zf = ZipFile("C:/Temp/tempfile.zip")
zf.extractall(path = save_path)
zf.close()
I get the following error:
BadZipFile: File is not a zip file
Thanks much.
My Python version is 3.7 and my Spyder Version is 3.3.6.
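One likely cause: that URL is a directory listing page, so urlopen returns HTML rather than zip bytes, which is exactly what makes ZipFile raise BadZipFile. Here is a rough sketch of downloading a specific archive from that listing instead (the archive name below is an assumption; substitute the file you actually need) and verifying it before extraction:
from urllib.request import urlopen
from zipfile import ZipFile, is_zipfile

# The original URL points at a directory listing (HTML), not a .zip file.
# 'ca2020.pl.zip' is an assumed example file name from that listing.
base_url = ('https://www2.census.gov/programs-surveys/decennial/2020/data/'
            '01-Redistricting_File--PL_94-171/California/')
zip_url = base_url + 'ca2020.pl.zip'
local_path = 'C:/Temp/tempfile.zip'
save_path = 'C:/Temp/extracted'  # destination folder, assumed

with urlopen(zip_url) as resp, open(local_path, 'wb') as out:
    out.write(resp.read())

# Guard against saving an HTML error/listing page as a ".zip".
if is_zipfile(local_path):
    with ZipFile(local_path) as zf:
        zf.extractall(path=save_path)
else:
    print("Downloaded content is not a zip archive:", zip_url)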
Using Python 3, I'm trying to read the newest CSV file with pandas from the path and directory specified, but I'm receiving "NotADirectoryError" on "latest_file = os.listdir(latest_date)[-1]".
import pandas as pd
import os
#naming path for most recent file
path='/filepath/'
#specifying full path
latest_date = path+os.listdir(path)[-1]+'/'
#identifying newest file in directory
latest_file = os.listdir(latest_date)[-1]
csv_to_read=pd.read_csv(latest_date+latest_file)
display(csv_to_read)
There is no need to do any awkward slicing; see here.
import glob
import os
import pandas as pd
list_of_files = glob.glob('/filepath/*.csv')
To consider the possibility of not having a csv file in the directory:
if list_of_files:
    latest_file = max(list_of_files, key=os.path.getctime)
    csv_to_read = pd.read_csv(latest_file)
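If you prefer to keep the original two-step approach (newest date folder, then newest file inside it), the NotADirectoryError can also be avoided by filtering the os.listdir results. A rough sketch, assuming the folder and file names sort chronologically as in the original code:
import os
import pandas as pd

path = '/filepath/'

# Keep only sub-directories, then only .csv files, before picking the "latest".
date_dirs = sorted(d for d in os.listdir(path)
                   if os.path.isdir(os.path.join(path, d)))
latest_date = os.path.join(path, date_dirs[-1])

csv_files = sorted(f for f in os.listdir(latest_date)
                   if f.endswith('.csv'))
latest_file = os.path.join(latest_date, csv_files[-1])

csv_to_read = pd.read_csv(latest_file)
print(latest_file)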
I am a beginner user of Python.
I am writing a Python script using "replace contractions" to replace contractions in all text files in the same directory with the expanded words, and then outputting the replaced files to another directory.
The code looks like the following at present:
import re, string, unicodedata
import nltk
import contractions
import inflect
import os

txt_files = [f for f in os.listdir('./test') if f.endswith('.txt')]

fd = open(txt_files)
with open(txt_files) as fd:
    fd.method
fd.close()

def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

output_strings = map(replace_contractions, txt_files)
output_content = "".join(sorted(output_strings))  # sort join the output strings without separators

# write to file
with open(folder_path + output_filename, 'wt') as outfile:
    outfile.write(output_content)
The error I received is:
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Text Preprocessing.py", line 9, in <module>
    fd = open(txt_files)
TypeError: invalid file: ['1.txt', '2.txt']
Can anyone advise me on resolving the error? Thank you!
I have now edited my code to the following:
import re, string, unicodedata
import nltk
import contractions
import inflect
import os

txt_files = [f for f in os.listdir('./test') if f.endswith('.txt')]

import glob
for each_file in glob.glob("arc\.\d+\.txt"):
    print(each_file)

def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

output_strings = map(replace_contractions, txt_files)
output_content = "".join(sorted(output_strings))  # sort join the output strings without separators

# write to file
folder_path = 'C:\\Users\\User\\Desktop\\test1\\'
output_filename = os.path.join(folder_path, '.txt')
with open(output_filename, 'wt') as outfile:
    outfile.write(output_content)
There is no error, but I get two output files: the first is a text file containing the string "1.txt2.txt", and the second has an underscore as its file name and no extension. I am not getting the desired output, i.e. the contractions in the text inside the txt files are not being expanded. Can anyone help out?
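The root of the problem is that replace_contractions is being mapped over the file names rather than the file contents. A minimal sketch of the intended flow (read each file, expand contractions, write a file of the same name to the output folder; the folder paths are the ones from the question) might look like this:
import os
import contractions

input_folder = './test'
output_folder = 'C:\\Users\\User\\Desktop\\test1\\'

txt_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

for name in txt_files:
    # Read the contents of the file, not just its name.
    with open(os.path.join(input_folder, name), encoding='utf8') as infile:
        text = infile.read()

    # Expand contractions in the text itself.
    expanded = contractions.fix(text)

    # Write the processed text to a file of the same name in the output folder.
    with open(os.path.join(output_folder, name), 'w', encoding='utf8') as outfile:
        outfile.write(expanded)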
I have images in the same directory as a Python file, and I am trying to loop over the images and convert them to base64, but I am getting this error.
I am using Ubuntu 14.04.
Traceback (most recent call last):
  File "convert_to_base64.py", line 33, in <module>
    print(main())
  File "convert_to_base64.py", line 26, in main
    convert_to_base64()
  File "convert_to_base64.py", line 19, in convert_to_base64
    with open("*.jpg", "rb") as f:
IOError: [Errno 2] No such file or directory: '*.jpg'
Here is my Python code:
# -*- coding: utf-8 -*-
import os
import sys
import xlrd
import base64
import urllib
from datetime import datetime

reload(sys)  # to re-enable sys.setdefaultencoding()
sys.setdefaultencoding('utf-8')

def convert_to_base64():
    """
    Read all jpg images in a folder,
    and print them in base64
    """
    with open("*.jpg", "rb") as f:
        data = base64.b64decode(f.read())
        print data

def main():
    start_datetime = datetime.now()
    convert_to_base64()
    end_datetime = datetime.now()
    print '------------------------------------------------------'
    print 'Script started : {}'.format(start_datetime)
    print 'Script finished: {}'.format(end_datetime)

if __name__ == '__main__':
    print(main())
    print('Done')
Can someone help me figure out what I am doing wrong?
Thanks.
This is how I looped over the images in a directory:
import os

pictures = []
for file in os.listdir("pictures"):
    if file[-3:].lower() in ["png"]:
        pictures.append(file)
Please refer to the Python documentation at https://docs.python.org/2/tutorial/inputoutput.html for more info on the open() function:
open() returns a file object, and is most commonly used with two arguments: open(filename, mode).
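Putting the two together, here is a rough sketch of that loop applied to the original goal: globbing the *.jpg files (open() cannot expand wildcards itself) and base64-encoding each one. Note it uses b64encode, since the original code calls b64decode on raw image bytes, which is presumably not what was intended.
import base64
import glob

# Sketch only: open() does not expand wildcards, so glob the file names first.
for path in glob.glob("*.jpg"):
    with open(path, "rb") as f:
        # b64encode turns the raw image bytes into base64 text
        # (b64decode goes the other way).
        encoded = base64.b64encode(f.read())
    print("%s: %s..." % (path, encoded[:40]))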