I am trying to import student data from an Excel workbook. I have to select column_name of the class StudentMasterResource dynamically which is present in the file. I got all column name in constants module has one dictionary which name column_name. When I do it for the first time, it works, then it fails
constants.py
column_name = dict()
resource.py
from common_account import constants
from import_export import widgets, fields, resources
def getClassName(key):
if key in constants.column_name:
return constants.column_name[key]
return key
class StudentMasterResource(resources.ModelResource):
organisation_id = fields.Field(
column_name=getClassName('organisation_id'),
attribute='organisation_id',
widget=widgets.ForeignKeyWidget(OrganisationMaster, 'organisation_name'),
saves_null_values=True
)
name = fields.Field(
column_name=getClassName('Name'),
attribute='name',
saves_null_values=True,
widget=widgets.CharWidget()
)
date_of_birth = fields.Field(
column_name=getClassName('date'),
attribute='date_of_birth',
saves_null_values=True,
widget=widgets.DateWidget()
)
views.py
from common_account import constants
from tablib import Dataset
#api_view(['POST'])
#permission_classes([IsAuthenticated])
def student_import(request):
if request.method == 'POST':
context_data = dict()
data_set = Dataset()
file = request.FILES['myfile']
extension = file.name.split(".")[-1].lower()
column_data = request.data
is_import = column_name['is_import']
constants.valid_data.clear()
constants.invalid_data.clear()
if extension == 'csv':
data = data_set.load(file.read().decode('utf-8'), format=extension)
else:
data = data_set.load(file.read(), format=extension)
constants.column_name = {
'date' : column_data.get('birth'),
'name' : column_data.get('name'),
}
if is_import == 'No':
result = student_resource.import_data(data_set, organisation_id = request.user.organisation_id,
offering_id = offering_id,all_invalid_data = False, dry_run=True, raise_errors=True)
context_data['valid_data'] = constants.valid_data
context_data['invalid_data'] = constants.invalid_data
context_data[constants.RESPONSE_RESULT] = {"Total records":student_resource.total_cnt,
"skip records":len(constants.invalid_data),
"Records imported": len(constants.valid_data),
}
return JsonResponse(context_data)
elif is_import == 'Yes':
result = student_resource.import_data(data_set, organisation_id = request.user.organisation_id,
offering_id = offering_id,all_invalid_data = False, dry_run=False, raise_errors=False)
context_data[constants.RESPONSE_ERROR] = False
context_data[constants.RESPONSE_MESSAGE] = 'Data Imported !!!'
context_data[constants.RESPONSE_RESULT] = {"Total records":student_resource.total_cnt,
"skip records":len(constants.invalid_data),
"Records imported": len(constants.valid_data),
}
return JsonResponse(context_data)
I'm creating a boto3 script that scrapes and uploads our entire accounts Public Ips and NatGateway Ips to our S3 bucket. I'm stuck on writing files for both returns. I would ideally like to write two separate files while still using the same filename variable you see in main(). Right now I can get this to work with only one return(either nat_ips or public_ips)
import boto3
from datetime import datetime
from csv import writer
def get_ips():
# Uses STS to assume the role needed.
boto_sts=boto3.client('sts')
sts_response = boto_sts.assume_role(
RoleArn='arn:aws:iam::1234:role/foo',
RoleSessionName='Foo'
)
# Save the details from assumed role into vars
sts_credentials = sts_response["Credentials"]
session_id = sts_credentials["AccessKeyId"]
session_key = sts_credentials["SecretAccessKey"]
session_token = sts_credentials["SessionToken"]
# List and store all the regions
ec2_client=boto3.client('ec2',aws_access_key_id=session_id,aws_secret_access_key=session_key,aws_session_token=session_token,region_name='us-west-1')
all_regions=[region['RegionName'] for region in ec2_client.describe_regions()['Regions']]
nat_ips = []
public_ips = []
for region in all_regions:
max_results = 1000
next_token = ''
ec2_client=boto3.client('ec2',aws_access_key_id=session_id,aws_secret_access_key=session_key,aws_session_token=session_token,region_name=region)
session=boto3.Session(aws_access_key_id=session_id, aws_secret_access_key=session_key, aws_session_token=session_token, region_name=region)
while next_token or next_token == '':
response = ec2_client.describe_nat_gateways(MaxResults=max_results, NextToken=next_token)
filters = [{'Name':'tag:Name', 'Values':['*sgw-eip']}]
get_ips = ec2_client.describe_addresses(Filters=filters)
for gateway in response["NatGateways"]:
for address in gateway["NatGatewayAddresses"]:
nat_ips.append(address["PublicIp"]+'/32')
for eip_dict in get_ips['Addresses']:
public_ip_string = eip_dict['Tags'][0]['Value'] + ' : ' + eip_dict['PublicIp']
public_ips.append(public_ip_string)
next_token = response.get("NextToken", None)
return nat_ips, public_ips
def _s3_upload(filename):
s3 = boto3.resource('s3')
bucket = 'foo-bar'
object_name = 'foo/'
s3.meta.client.upload_file(Filename=filename,Bucket=bucket,Key=object_name+filename)
print(f'Uploading {filename} to {bucket}')
def write_list_to_file(filename, data):
lines_string = '\n'.join(str(x) for x in data)
with open(filename,'w') as output:
output.writelines(lines_string)
print(f'Writing file to {filename}')
if __name__ == "__main__":
date = datetime.now().strftime('%Y%m%d')
# Stuck here since I want to make it one variable
filename_nat_ips = f'natgateway_ips{date}.csv'
filename_sga_ips = f'sga_ips{date}.csv'
public_ips = get_ips()
nat_ips = get_ips()
print(filename)
write_list_to_file(filename, nat_ips)
_s3_upload(filename)
I see that you are already returning a tuple of public_ips and nat_ips from your get_ips() function. So in your main, you could collect them together as well.
You might try something like this:
if __name__ == "__main__":
date = datetime.now().strftime('%Y%m%d')
# Stuck here since I want to make it one variable
filename_nat_ips = f'natgateway_ips{date}.csv'
filename_sga_ips = f'sga_ips{date}.csv'
nat_ips, public_ips = get_ips()
write_list_to_file(filename_nat_ips, nat_ips)
write_list_to_file(filename_public_ips, public_ips)
_s3_upload(filename_nat_ips)
_s3_upload(filename_public_ips)
I was doing it right the first time. And was trying to make it more complicated.
if __name__ == "__main__":
date = datetime.now().strftime('%Y%m%d')
filename_nat_ips = f'natgateway_ips{date}.csv'
filename_sga_ips = f'sga_ips{date}.csv'
nat_ips, public_ips = get_ips()
print(filename_nat_ips)
print(filename_sga_ips)
write_list_to_file(filename_nat_ips, nat_ips)
write_list_to_file(filename_sga_ips, public_ips)
_s3_upload(filename_nat_ips)
_s3_upload(filename_sga_ips)
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run Python script that loads the data into csv but getting this error. can anyone explain me this error
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery( query, filename, idNeeded ):
if idNeeded == 1:
i = 1
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([i,row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
i = i+1
except AttributeError as error:
print('An error occured: {0}'.format(error))
else:
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format( row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
except AttributeError as error:
print('An error occured: {0}'.format(error))
return
#Creates a dataset in BigQuery
def createDataset(datasetname):
dataset_ref = bigquery_client.dataset(datasetname)
dataset = Dataset(dataset_ref)
dataset.location = 'US'
dataset = bigquery_client.create_dataset(dataset)
return
def getDataset(datasetname):
dataset = bigquery_client.dataset(datasetname)
return dataset
def createTable(tablename, global_dataset_ref):
schema = [
#Enter Schema here.
# SchemaField('url', 'STRING', mode='required'),
# SchemaField('views', 'INTEGER', mode='required')
]
table_ref = global_dataset_ref.table(tablename)
table = Table(table_ref, schema=schema)
table = bigquery_client.create_table(table)
assert table.table_id == tablename
return
def getTable(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
# print(table.table_id)
print(table.schema)
# print(table.description)
# print(table.num_rows)
return table
def getTableSchema(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
schema = table.schema
return schema
def loadDataFromCSV(tablename, global_dataset_ref, filename):
schema = getTableSchema(tablename, global_dataset_ref)
table_ref = global_dataset_ref.table(tablename)
load_config = LoadJobConfig()
load_config.source_format = bigquery.SourceFormat.CSV
load_config.schema = schema
load_config.autodetect = True
load_config.allow_quoted_newlines = True
with open (filename, 'rb') as readable:
job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
job.result()
print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
return
# Testing
if __name__ == '__main__':
datasetname = 'Data_Layer'
tablename = 'Events'
sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
#createDataset(datasetname) #Successfully tested this code 2018-09-24
global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24
#createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
runBigQueryQuery(sqlquery,filename,idNeeded) #Successfully tested this code 2018-09-24
loadDataFromCSV(tablename, global_dataset_ref,filename) #Successfully tested this code 2018-09-24
sample data
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only 1 row in your CSV, you might be missing new lines while making it.
My first python project, I'm trying to scrape restaurant inspection. One site has summaries that offer keys to the detailed reports that I want to scrape. I'm stumped at looping through the keyed list of urls to get the details.
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError
try:
insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
usecols=[2,14,18,80,81])
except IOError:
print("The file is not accessible.")
insp.columns = ["CountyName", "InspectDate",
"NumHighVio", "LicenseID", "VisitID"]
# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) &
(alachua['InspectDate'] < today)]
# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
visitID = rows['VisitID']
licID = rows['LicenseID']
urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=
%s &licid= %s" % (visitID, licID)
urls = urls.replace(' ', '')
print(urls)
## here's my problem:
for url in urls:
def get_inspect_detail():
html = urlopen(url)
soup = bs4.BeautifulSoup(html.read(), 'lxml')
details = soup.find_all('font', {'face':'verdana'})[10:]
for detail in details:
siteName = details[0].text
licNum = details[2].text
siteRank = details[4].text
expDate = details[6].text
primeStatus = details[8].text
secStatus = details[10].text
siteAddress = details[12].text
inspectResult = details[20].text
observed1 = details[34].get_text
observed2 = details[36].text
observed3 = details[38].text
observed4 = details[40].text
observed5 = details[42].text
observed6 = details[44].text
observed7 = details[46].text
observed8 = details[48].text
observed9 = details[50].text
observed10 = details[52].text
detailsLib = {
'Restaurant': siteName,
'License': licNum,
'Rank': siteRank,
'Expires': expDate,
'Primary': primeStatus,
'Secondary': secStatus,
'Address': siteAddress,
'Result': inspectResult,
'Observed1': observed1,
'Observed2': observed2,
'Observed3': observed3,
'Observed4': observed4,
'Observed5': observed5,
'Observed6': observed6,
'Observed7': observed7,
'Observed8': observed8,
'Observed9': observed9,
'Observed10': observed10
}
repr(get_inspect_detail())
Probably an obvious mistake or lack of knowledge, but I can get the unscrubbed data for one url, but not for all.
I dont see a reason to define your function inside the loop. You would end up with a lot of redundant definitions this way. Second, you could just define a result list and accumulate the detailsLib objects inside it.
def get_inspect_detail(url):
html = urlopen(url)
soup = bs4.BeautifulSoup(html.read(), 'lxml')
details = soup.find_all('font', {'face': 'verdana'})[10:]
result = []
for detail in details:
siteName = details[0].text
licNum = details[2].text
siteRank = details[4].text
expDate = details[6].text
primeStatus = details[8].text
secStatus = details[10].text
siteAddress = details[12].text
inspectResult = details[20].text
observed1 = details[34].get_text
observed2 = details[36].text
observed3 = details[38].text
observed4 = details[40].text
observed5 = details[42].text
observed6 = details[44].text
observed7 = details[46].text
observed8 = details[48].text
observed9 = details[50].text
observed10 = details[52].text
detailsLib = {
'Restaurant': siteName,
'License': licNum,
'Rank': siteRank,
'Expires': expDate,
'Primary': primeStatus,
'Secondary': secStatus,
'Address': siteAddress,
'Result': inspectResult,
'Observed1': observed1,
'Observed2': observed2,
'Observed3': observed3,
'Observed4': observed4,
'Observed5': observed5,
'Observed6': observed6,
'Observed7': observed7,
'Observed8': observed8,
'Observed9': observed9,
'Observed10': observed10
}
result.append(detailsLib)
return result
for url in urls:
repr(get_inspect_detail(url))