I am trying to upload a set of CSV data into a BigQuery table from a BytesIO object, but I keep getting the error InvalidResponse: Response headers must contain header 'location'.
Here is my code:
# self.database = authenticated bigquery.Client
config = bigquery.LoadJobConfig()
config.skip_leading_rows = 1
config.source_format = bigquery.SourceFormat.CSV
config.allow_jagged_rows = True

schema = [
    bigquery.SchemaField("date", "DATE", mode="REQUIRED"),
    bigquery.SchemaField("page_id", "STRING", mode="REQUIRED")
]
# ... Appending a list of bigquery.SchemaField("name", "INTEGER")
config.schema = schema

table = self.get_or_create_table(name, config.schema)  # returns TableReference
file = self.clip_data(local_fp, cutoff_date)  # returns BytesIO

job = self.database.load_table_from_file(
    file, table,
    num_retries=self.options.num_retries,
    job_id=uuid.uuid4().int,
    job_config=config
)  # Error is here.
I have searched around but cannot find any explanation or fix for this exception:
InvalidResponse: ('Response headers must contain header', 'location')
The problem was caused by not providing a location to the load_table_from_file method. Adding
location="US"
to the call was enough to fix the problem.
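For anyone hitting the same thing, here is a minimal sketch of the corrected call (reusing the client, file, table and config objects from the question; the substantive change is the location argument, and the job ID is also passed as a string, which is the form BigQuery expects):

job = self.database.load_table_from_file(
    file, table,
    num_retries=self.options.num_retries,
    job_id=str(uuid.uuid4()),  # pass the job ID as a string
    job_config=config,
    location="US",  # must match the dataset's location
)
job.result()  # wait for the load job to finish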
I'm deploying a simple Python function to Google Cloud Functions but cannot get it to save. It shows this error:
"Function failed on loading user code. This is likely due to a bug in the user code. Error message: Error: please examine your function logs to see the error cause: https://cloud.google.com/functions/docs/monitoring/logging#viewing_logs. Additional troubleshooting documentation can be found at https://cloud.google.com/functions/docs/troubleshooting#logging. Please visit https://cloud.google.com/functions/docs/troubleshooting for in-depth troubleshooting documentation."
The logs don't seem to show much that would indicate an error in the code. I followed this guide: https://blog.thereportapi.com/automate-a-daily-etl-of-currency-rates-into-bigquery/ with the only differences being the environment variables and the endpoint I'm using.
The code is below; it is just a GET request followed by a push of the data into a table.
import requests
import json
import time
import os
from google.cloud import bigquery

# Set any default values for these variables if they are not found from Environment variables
PROJECT_ID = os.environ.get("PROJECT_ID", "xxxxxxxxxxxxxx")
EXCHANGERATESAPI_KEY = os.environ.get("EXCHANGERATESAPI_KEY", "xxxxxxxxxxxxxxx")
REGIONAL_ENDPOINT = os.environ.get("REGIONAL_ENDPOINT", "europe-west1")
DATASET_ID = os.environ.get("DATASET_ID", "currency_rates")
TABLE_NAME = os.environ.get("TABLE_NAME", "currency_rates")
BASE_CURRENCY = os.environ.get("BASE_CURRENCY", "SEK")
SYMBOLS = os.environ.get("SYMBOLS", "NOK,EUR,USD,GBP")

def hello_world(request):
    latest_response = get_latest_currency_rates()
    write_to_bq(latest_response)
    return "Success"

def get_latest_currency_rates():
    PARAMS = {'access_key': EXCHANGERATESAPI_KEY, 'symbols': SYMBOLS, 'base': BASE_CURRENCY}
    response = requests.get("https://api.exchangeratesapi.io/v1/latest", params=PARAMS)
    print(response.json())
    return response.json()

def write_to_bq(response):
    # Instantiates a client
    bigquery_client = bigquery.Client(project=PROJECT_ID)
    # Prepares a reference to the dataset
    dataset_ref = bigquery_client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(TABLE_NAME)
    table = bigquery_client.get_table(table_ref)
    # get the current timestamp so we know how fresh the data is
    timestamp = time.time()
    jsondump = json.dumps(response)  # Returns a string
    # Ensure the Response is a String not JSON
    rows_to_insert = [{"timestamp": timestamp, "data": jsondump}]
    errors = bigquery_client.insert_rows(table, rows_to_insert)  # API request
    print(errors)
    assert errors == []
I tried just the part that does the GET request in an offline editor and I can confirm the response works fine. I suspect it might have something to do with permissions or the way the script tries to access the database.
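One possible cause worth checking (an assumption, since the deployment setup isn't shown in the question): Python Cloud Functions only install third-party packages listed in a requirements.txt deployed alongside main.py, and a missing dependency fails the function at import time with exactly this kind of "failed on loading user code" message. A minimal requirements.txt for the imports above would be:

requests
google-cloud-bigquery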
I have developed the code below, which exports a BigQuery table into a Google Cloud Storage bucket. I want to merge the exported files into a single file without a header, so that downstream processes can use the file without any issues.
def export_bq_table_to_gcs(self, table_name):
    client = bigquery.Client(project=project_name)
    print("Exporting table {}".format(table_name))
    dataset_ref = client.dataset(dataset_name, project=project_name)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(table_name)
    size_bytes = client.get_table(table_ref).num_bytes

    # For tables bigger than 1GB uses Google auto split, otherwise export is forced in a single file.
    if size_bytes > 10 ** 9:
        destination_uris = [
            'gs://{}/{}{}*.csv'.format(bucket_name, f'{table_name}_temp', uid)]
    else:
        destination_uris = [
            'gs://{}/{}{}.csv'.format(bucket_name, f'{table_name}_temp', uid)]

    extract_job = client.extract_table(table_ref, destination_uris)  # API request
    result = extract_job.result()  # Waits for job to complete.

    if result.state != 'DONE' or result.errors:
        raise Exception('Failed extract job {} for table {}'.format(result.job_id, table_name))
    else:
        print('BQ table(s) export completed successfully')
        storage_client = storage.Client(project=gs_project_name)
        bucket = storage_client.get_bucket(gs_bucket_name)
        blob_list = bucket.list_blobs(prefix=f'{table_name}_temp')
        print('Merging shard files into single file')
        bucket.blob(f'{table_name}.csv').compose(blob_list)
Can you please help me find a way to skip the header?
Thanks,
Raghunath.
We can avoid the header by using the job config to set the print_header parameter to False. Sample code:
job_config = bigquery.job.ExtractJobConfig(print_header=False)
extract_job = client.extract_table(table_ref, destination_uris,
job_config=job_config)
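For context, dropped into the export function from the question it would look roughly like this (a sketch; table_ref and destination_uris are the variables already defined there):

job_config = bigquery.job.ExtractJobConfig(print_header=False)
extract_job = client.extract_table(
    table_ref,
    destination_uris,
    job_config=job_config
)  # API request
result = extract_job.result()  # waits for the job to complete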
Thanks
You can use skipLeadingRows (https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#externalDataConfiguration.googleSheetsOptions.skipLeadingRows)
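If the merged CSV is later loaded back into BigQuery with a header still present, the load-side equivalent is skip_leading_rows; a brief sketch using a LoadJobConfig like the one in the first question:

load_config = bigquery.LoadJobConfig()
load_config.source_format = bigquery.SourceFormat.CSV
load_config.skip_leading_rows = 1  # ignore the header row when loading the file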
I'm using Python 3 to write a script that generates a customer report for Solarwinds N-Central. The script uses SOAP to query N-Central, and I'm using zeep for this project. While not new to Python, I am new to SOAP.
When calling the CustomerList function I'm getting: TypeError: __init__() got an unexpected keyword argument 'listSOs'
import zeep
wsdl = 'http://' + <server url> + '/dms/services/ServerEI?wsdl'
client = zeep.CachingClient(wsdl=wsdl)
config = {'listSOs': 'true'}
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=config)
Per the parameters below, 'listSOs' is not only a valid keyword, it's the only one accepted.
CustomerList
public com.nable.nobj.ei.Customer[] CustomerList(String username, String password, com.nable.nobj.ei.T_KeyPair[] settings) throws RemoteException
Parameters:
username - MSP N-central username
password - Corresponding MSP N-central password
settings - A list of non default settings stored in a T_KeyPair[]. Below is a list of the acceptable Keys and Values. If not used leave null
(Key) listSOs - (Value) "true" or "false". If true only SOs will be shown, if false only customers and sites will be shown. Default value is false.
I've also tried passing the dictionary as part of a list:
config = []
key = {'listSOs': 'true'}
config += key
TypeError: Any element received object of type 'str', expected lxml.etree._Element or builtins.dict or zeep.objects.T_KeyPair
Omitting the Settings value entirely:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass)
zeep.exceptions.ValidationError: Missing element Settings (CustomerList.Settings)
And trying zeep's SkipValue:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=zeep.xsd.SkipValue)
zeep.exceptions.Fault: java.lang.NullPointerException
I'm probably missing something simple, but I've been banging my head against the wall off and on over this for a while. I'm hoping someone can point me in the right direction.
Here's the source code from my getAssets.py script. I did it in Python 2.7, but it's easily upgradeable. Hope it helps someone else; N-central's API documentation is really bad lol.
#pip2.7 install zeep
import zeep, sys, csv, copy
from zeep import helpers
api_username = 'your_ncentral_api_user'
api_password='your_ncentral_api_user_pw'
wsdl = 'https://(yourdomain|tenant)/dms2/services2/ServerEI2?wsdl'
client = zeep.CachingClient(wsdl=wsdl)
response = client.service.deviceList(
    username=api_username,
    password=api_password,
    settings={
        'key': 'customerId',
        'value': 1
    }
)

# If you can't tell yet, I code sloppy
devices_list = []
device_dict = {}
dev_inc = 0
max_dict_keys = 0
final_keys = []

for device in response:
    # Iterate through all device nodes
    for device_properties in device.items:
        # Iterate through each device's properties and add it to a dict (keyed array)
        device_dict[device_properties.first] = device_properties.second
    # Dig further into device properties
    device_properties = client.service.devicePropertyList(
        username=api_username,
        password=api_password,
        deviceIDs=device_dict['device.deviceid'],
        reverseOrder=False
    )
    prop_ind = 0  # This is a hacky thing I did to make my CSV writing work
    for device_node in device_properties:
        for prop_tree in device_node.properties:
            for key, value in helpers.serialize_object(prop_tree).items():
                prop_ind += 1
                device_dict["prop" + str(prop_ind) + "_" + str(key)] = str(value)
    # Append the dict to a list (array), giving us a multi dimensional array, you need to do deep copy, as .copy will act like a pointer
    devices_list.append(copy.deepcopy(device_dict))
    # check to see the amount of keys in the last item
    if len(devices_list[-1].keys()) > max_dict_keys:
        max_dict_keys = len(devices_list[-1].keys())
        final_keys = devices_list[-1].keys()

print "Gathered all the datas of N-central devices count: ", len(devices_list)

# Write the data out to a CSV
with open('output.csv', 'w') as csvfile:
    fieldnames = final_keys
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for csv_line in devices_list:
        writer.writerow(csv_line)
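Applied back to the original CustomerList question, the same key/value shape for the settings argument would presumably look like this (untested sketch; whether the WSDL wants a single dict or a list of them I haven't verified):

config = [{'key': 'listSOs', 'value': 'true'}]
customers = client.service.CustomerList(
    Username=nc_user,
    Password=nc_pass,
    Settings=config
)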
My avsc file is as follows:
{"type":"record",
"namespace":"testing.avro",
"name":"product",
"aliases":["items","services","plans","deliverables"],
"fields":
[
{"name":"id", "type":"string" ,"aliases":["productid","itemid","item","product"]},
{"name":"brand", "type":"string","doc":"The brand associated", "default":"-1"},
{"name":"category","type":{"type":"map","values":"string"},"doc":"the list of categoryId, categoryName associated, send Id as key, name as value" },
{"name":"keywords", "type":{"type":"array","items":"string"},"doc":"this helps in long run in long run analysis, send the search keywords used for product"},
{"name":"groupid", "type":["string","null"],"doc":"Use this to represent or flag value of group to which it belong, e.g. it may be variation of same product"},
{"name":"price", "type":"double","aliases":["cost","unitprice"]},
{"name":"unit", "type":"string", "default":"Each"},
{"name":"unittype", "type":"string","aliases":["UOM"], "default":"Each"},
{"name":"url", "type":["string","null"],"doc":"URL of the product to return for more details on product, this will be used for event analysis. Provide full url"},
{"name":"imageurl","type":["string","null"],"doc":"Image url to display for return values"},
{"name":"updatedtime", "type":"string"},
{"name":"currency","type":"string", "default":"INR"},
{"name":"image", "type":["bytes","null"] , "doc":"fallback in case we cant provide the image url, use this judiciously and limit size"},
{"name":"features","type":{"type":"map","values":"string"},"doc":"Pass your classification attributes as features in key-value pair"}
]}
I am able to parse this, but when I try to write to it as follows, I keep getting an issue. What am I missing? This is in Python 3. I verified that it is well-formatted JSON, too.
from avro import schema as sc
from avro import datafile as df
from avro import io as avio
import os
_prodschema = 'product.avsc'
_namespace = 'testing.avro'
dirname = os.path.dirname(__file__)
avroschemaname = os.path.join( os.path.dirname(__file__),_prodschema)
sch = {}
with open(avroschemaname, 'r') as f:
    sch = f.read().encode(encoding='utf-8')
    f.close()
proschema = sc.Parse(sch)
print("Schema processed")
writer = df.DataFileWriter(open(os.path.join(dirname, "products.json"), 'wb'),
                           avio.DatumWriter(), proschema)
print("Just about to append the json")
writer.append({ "id":"23232",
"brand":"Relaxo",
"category":[{"123":"shoe","122":"accessories"}],
"keywords":["relaxo","shoe"],
"groupid":"",
"price":"799.99",
"unit":"Each",
"unittype":"Each",
"url":"",
"imageurl":"",
"updatedtime": "03/23/2017",
"currency":"INR",
"image":"",
"features":[{"color":"black","size":"10","style":"contemperory"}]
})
writer.close()
What am I missing here?
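Not a confirmed fix, but for comparison, a datum shaped to match the declared schema types would look like the sketch below: the map fields (category, features) take plain dicts rather than lists of dicts, price is a float to satisfy the double type, and the nullable union fields take None (or real bytes for image) instead of empty strings:

writer.append({
    "id": "23232",
    "brand": "Relaxo",
    "category": {"123": "shoe", "122": "accessories"},  # map -> plain dict
    "keywords": ["relaxo", "shoe"],
    "groupid": None,        # ["string","null"] union
    "price": 799.99,        # double -> float, not a string
    "unit": "Each",
    "unittype": "Each",
    "url": None,
    "imageurl": None,
    "updatedtime": "03/23/2017",
    "currency": "INR",
    "image": None,          # ["bytes","null"] union -> bytes or None
    "features": {"color": "black", "size": "10", "style": "contemperory"}  # map -> plain dict
})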
I have a list of warc records. Every single item in the list is created like this:
header = warc.WARCHeader({
"WARC-Type": "response",
"WARC-Target-URI": "www.somelink.com",
}, defaults=True)
data = "Some string"
record = warc.WARCRecord(header, data.encode('utf-8','replace'))
Now, I am using *.warc.gz to store my records like this:
output_file = warc.open("my_file.warc.gz", 'wb')
And write records like this:
output_file.write_record(record) # type of record is WARCRecord
But how can I compress with lzma, as *.warc.xz? I have tried replacing gz with xz when calling warc.open, but warc in Python 3 does not support this format. I found this approach, but I was not able to save a WARCRecord with it:
output_file = lzma.open("my_file.warc.xz", 'ab', preset=9)
header = warc.WARCHeader({
"WARC-Type": "response",
"WARC-Target-URI": "www.somelink.com",
}, defaults=True)
data = "Some string"
record = warc.WARCRecord(header, data.encode('utf-8','replace'))
output_file.write(record)
The error message is:
TypeError: a bytes-like object is required, not 'WARCRecord'
Thanks for any help.
The WARCRecord class has a write_to method, to write records to a file object.
You could use that to write records to a file created with lzma.open().
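A short sketch of what that could look like, reusing the header and data from the question (untested; it assumes write_to() accepts any binary file object):

import lzma
import warc

header = warc.WARCHeader({
    "WARC-Type": "response",
    "WARC-Target-URI": "www.somelink.com",
}, defaults=True)
record = warc.WARCRecord(header, "Some string".encode('utf-8', 'replace'))

# lzma.open() returns a binary file object, so the record can be written straight into it
with lzma.open("my_file.warc.xz", 'wb', preset=9) as output_file:
    record.write_to(output_file)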