Store data in the correct format after scraping in Python: post-processing the CSV data - python-3.x

Scraping is done; now I have to postprocess the data stored in a CSV file:
import pandas as pd
import uuid

def postprocess():
    clean = ['Home','Shri','Smt','Dr','Mr','Mrs','Ms','Contact','Facebook','account','photo','Of','Biography','Email','Address','Roles','To','Read','more','Blog','Vacancies','Advertisement','Advertise','Holding','Second','Estimates','Fax','Find','Close','Links','Key','Figures','Stats','Statistics','Household','Full','name','Parks','Open','Menu','Languages','Opinion','Education','Address','Latest','Activity','NA','Father','Husband']
    banned = ['january','february','march','april','may','june','july','september','october','november','december','monday','tuesday','wednesday','thursday','friday','saturday','sunday']
    try:
        df = pd.read_csv("result.csv", names=['Prefix','Name','Designation','Nation','Created_date_time'], encoding='unicode_escape')
    except:
        return
    df = df.applymap(str)
    df.replace(to_replace='nan', value='', inplace=True)
    print("Postprocessing...")
    df.dropna(subset=['Name'], inplace=True)
    df.drop_duplicates(subset=['Name'], keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    for count, _ in enumerate(df['Name']):
        if len(df['Name'][count]) < 4:
            df.drop(count, axis=0, inplace=True)
            continue
        if df['Prefix'][count].isnumeric() or len(df['Prefix'][count]) > 15:
            df.at[count, 'Prefix'] = ''
        if 'Home' in df['Prefix'][count]:
            df.at[count, 'Prefix'] = df['Name'][count].replace('Home', '')
        for item in df['Name'][count].split(' '):
            if len(item) < 2:
                df.at[count, 'Name'] = df['Name'][count].replace(item, '')
            for word in clean:
                if word in item:
                    new_val = df['Name'][count].replace(word, '').strip()
                    df.at[count, 'Name'] = new_val
            for ban in banned:
                if ban in item.lower():
                    df.drop(count, axis=0, inplace=True)
                    break
            else:
                continue
            break
    df.dropna(subset=['Name'], inplace=True)
    df.drop_duplicates(subset=['Name'], keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    uuids = []
    for _ in range(len(df)):
        uuids.append(uuid.uuid4().hex)
    df.insert(0, '_id', uuids)
    print("Postprocessing complete. Database contains {} rows".format(len(df)))
    df.to_csv("result.csv", index=False)
I am not getting the required result.
What I am getting is:
_id          Prefix    Name                                    Designation   Nation           Created_date_time
(random id)  (empty)   font size present (some random words)  India         (date and time)  (this part is correct)
The Prefix is missing, the Name is not correct, the country shows up in place of Designation, and the date and time show up in place of Nation.
What I actually want is:
_id          Prefix   Name          Designation   Nation   Created_date_time
(random id)  Mr       Ankur Singh   Analyst       India    (date and time)
Is there any suggestion for this? What do I need to correct, or what else can I do?
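One way to narrow this down is to check whether the columns are already shifted in the raw result.csv before postprocess() touches it; a minimal diagnostic sketch, assuming result.csv is the scraper's direct output with no header row:

import csv

# Print the first few raw rows so each field can be checked against the
# intended column order: Prefix, Name, Designation, Nation, Created_date_time.
with open("result.csv", newline="", encoding="utf-8", errors="replace") as f:
    for i, row in enumerate(csv.reader(f)):
        print(len(row), row)
        if i == 9:
            break

If the country already appears in the third field of the raw rows, the misalignment comes from the order (or number) of fields the scraper writes per row, not from this postprocessing step, and the fix belongs in the scraping code.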

Related

How to bulk create or update in Django

I have to process an item report CSV file every hour. The CSV contains 150k+ records for one account, and there are multiple accounts in my system. I was previously working in Rails, where an Active Record gem handled this use case very efficiently. I am looking for an alternative to that gem in Django, or any built-in method that would help import such large data in bulk.
So far I have tried this code.
class ItemReportService:

    def call(self, file_url):
        with open(file_url, 'r') as file:
            reader = csv.DictReader(file)
            products = []
            for row in reader:
                product = self.process_product(row)
                products.append(product)
            self.update_products(products)

    def process_product(self, row):
        print(f'Processing sku: {row["SKU"]}')
        product = Product.objects.filter(
            sku=row['SKU']).first() or Product(sku=row['SKU'])
        product.listing_title = row['Product Name']
        product.listed_price = row['Price']
        product.buy_box_price = row['Buy Box Item Price'] + \
            row['Buy Box Shipping Price']
        product.status = row['Lifecycle Status']
        return product

    def update_products(self, products):
        Product.objects.bulk_update(
            products,
            [
                'listing_title',
                'listed_price',
                'buy_box_price',
                'Lifecycle Status'
            ]
        )
It raises this exception because a new product does not yet have a primary key assigned to it:
ValueError: All bulk_update() objects must have a primary key set.
Django 4.1 added new parameters to bulk_create(): update_conflicts (a bool) and update_fields (a list).
If your model has a field marked UNIQUE, Django would normally refuse to insert a conflicting row. But if you set the update_conflicts parameter to True, the conflicting rows are updated instead, using the fields listed in update_fields.
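A minimal sketch of how that could look for the Product model in the question, assuming sku is declared unique=True (on backends such as PostgreSQL and SQLite you also have to name the conflict target via unique_fields):

# Insert new products and update existing ones in a single bulk_create() call (Django 4.1+).
Product.objects.bulk_create(
    products,
    update_conflicts=True,
    unique_fields=['sku'],
    update_fields=['listing_title', 'listed_price', 'buy_box_price', 'status'],
)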
You are not saving the product in the database before applying bulk_update.
I have checked your code; for this purpose you can use bulk_create with an additional parameter:
Model.objects.bulk_create(self.data, ignore_conflicts=True)
or
columns = ['column1', 'column2']
items_to_be_inserted = []
for row in reader:  # reader is the csv.DictReader from your code
    obj = Model.objects.filter(column1="sku").first()
    if not obj:
        obj = Model.objects.create(column1="sku")
    obj.column1 = row["column1"] or obj.column1
    obj.column2 = row["column2"] or obj.column2
    items_to_be_inserted.append(obj)
In the end, you can do the bulk update like this:
Model.objects.bulk_update(items_to_be_inserted, columns)
This will solve your problem.
I made this classmethod, which can be used on any Django model in a project.
from django.db import models

class BaseModel(models.Model):
    class Meta:
        abstract = True  # base class only; no table of its own

    @classmethod
    def bulk_create_or_update(
        cls, uniques: list[str],
        defaults: list[str],
        data: list[dict]
    ):
        # Get existing object list
        data_dict, select = {}, None
        for entry in data:
            sub_entry, key = {}, ''
            for uniq in uniques:
                sub_entry[uniq] = entry[uniq]
                key += str(entry[uniq])
            data_dict[key] = entry
            if not select:
                select = models.Q(**sub_entry)
                continue
            select |= models.Q(**sub_entry)
        records = cls.objects.filter(select).values('pk', *uniques)
        existing = {}
        for rec in records:
            key = ''
            for uniq in uniques:
                key += str(rec[uniq])
            existing[key] = rec
        # Split new objects from existing ones
        to_create, to_update = [], []
        for key, entry in data_dict.items():
            obj = cls(**entry)
            if key not in existing:
                to_create.append(obj)
                continue
            obj.pk = existing[key]['pk']
            to_update.append(obj)
        cls.objects.bulk_create(to_create, batch_size=1000)
        cls.objects.bulk_update(to_update, defaults, batch_size=1000)
Let's take a usage example:
class Product(BaseModel):
    price = models.IntegerField()
    name = models.CharField(max_length=128, unique=True)
    status = models.CharField(max_length=128)

if __name__ == '__main__':
    data = [
        {'price': 50, 'name': 'p1', 'status': 'New'},
        {'price': 33, 'name': 'p2', 'status': 'Old'}
    ]
    Product.bulk_create_or_update(uniques=['name'], defaults=['price', 'status'], data=data)
Any suggestions for improving the code are welcome.

How to scrape a table and its links

What I want to do is take the following website
https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html
view-source:https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html
And pick the year of execution, enter the Last Statement Link, and retrieve the statement... perhaps I would be creating 2 dictionaries, both with the execution number as key.
Afterwards, I would classify the statements by length, besides "flagging" the refusals to give it or if it was just not given.
Finally, all would be compiled in a SQLite database, and I would display a graph that shows how many messages, clustered by type, have been given each year.
Beautiful Soup seems to be the path to follow, but I'm already having trouble just printing the year of execution... Of course, I'm not ultimately interested in printing the years of execution, but it seems like a good way of checking whether my code is at least locating the tags I want.
tags = soup('td')
for tag in tags:
    print(tag.get('href', None))
Why does the previous code only print None?
Thanks beforehand.
Your loop prints None because the href attribute lives on the <a> elements inside each table cell, not on the <td> tags themselves. You don't actually need to scrape those links, though: use pandas to get and manipulate the table. The links are static, by which I mean they can easily be recreated from the offender's first and last name.
Then, you can use requests and BeautifulSoup to scrape each offender's last statement; many of them are quite moving.
Here's how:
import requests
import pandas as pd

def clean(first_and_last_name: list) -> str:
    name = "".join(first_and_last_name).replace(" ", "").lower()
    # by this point the name is lowercase with spaces removed, so the
    # suffixes appear as ",jr." / ",sr."
    return name.replace(",jr.", "").replace(",sr.", "").replace("'", "")

base_url = "https://www.tdcj.texas.gov/death_row"
response = requests.get(f"{base_url}/dr_executed_offenders.html")

df = pd.read_html(response.text, flavor="bs4")
df = pd.concat(df)
df.rename(columns={'Link': "Offender Information", "Link.1": "Last Statement URL"}, inplace=True)

df["Offender Information"] = df[
    ["Last Name", 'First Name']
].apply(lambda x: f"{base_url}/dr_info/{clean(x)}.html", axis=1)

df["Last Statement URL"] = df[
    ["Last Name", 'First Name']
].apply(lambda x: f"{base_url}/dr_info/{clean(x)}last.html", axis=1)

df.to_csv("offenders.csv", index=False)
This gets you a CSV file of the whole table, with working links to each offender's information page and last-statement page.
EDIT:
I actually went ahead and added the code that fetches all offenders' last statements.
import random
import time

import pandas as pd
import requests
from lxml import html

base_url = "https://www.tdcj.texas.gov/death_row"
response = requests.get(f"{base_url}/dr_executed_offenders.html")
statement_xpath = '//*[@id="content_right"]/p[6]/text()'

def clean(first_and_last_name: list) -> str:
    name = "".join(first_and_last_name).replace(" ", "").lower()
    # suffixes appear lowercased without spaces after the line above
    return name.replace(",jr.", "").replace(",sr.", "").replace("'", "")

def get_last_statement(statement_url: str) -> str:
    page = requests.get(statement_url).text
    statement = html.fromstring(page).xpath(statement_xpath)
    text = next(iter(statement), "")
    return " ".join(text.split())

df = pd.read_html(response.text, flavor="bs4")
df = pd.concat(df)
df.rename(
    columns={'Link': "Offender Information", "Link.1": "Last Statement URL"},
    inplace=True,
)

df["Offender Information"] = df[
    ["Last Name", 'First Name']
].apply(lambda x: f"{base_url}/dr_info/{clean(x)}.html", axis=1)

df["Last Statement URL"] = df[
    ["Last Name", 'First Name']
].apply(lambda x: f"{base_url}/dr_info/{clean(x)}last.html", axis=1)

offender_data = list(
    zip(
        df["First Name"],
        df["Last Name"],
        df["Last Statement URL"],
    )
)

statements = []
for item in offender_data:
    *names, url = item
    print(f"Fetching statement for {' '.join(names)}...")
    statements.append(get_last_statement(statement_url=url))
    time.sleep(random.randint(1, 4))

df["Last Statement"] = statements
df.to_csv("offenders_data.csv", index=False)
This will take a couple of minutes because the code "sleeps" for anywhere between 1 and 4 seconds after each request, so the server doesn't get abused.
Once this gets done, you'll end up with a .csv file with all offenders' data and their statements, if there was one.
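The original question also mentions compiling everything into a SQLite database; a minimal sketch of that last step, assuming the offenders_data.csv produced above:

import sqlite3

import pandas as pd

# Load the scraped CSV and store it as a table in a local SQLite database.
df = pd.read_csv("offenders_data.csv")
with sqlite3.connect("offenders.db") as conn:
    df.to_sql("offenders", conn, if_exists="replace", index=False)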

How to import a specific Key with its values in a csv file using Python?

In my program, the user will enter a country name and my code will first check whether or not the country name exists in the dictionary. If it exists, the code has to print the country name and its associated values to a CSV file. However, I am unable to do the CSV part. I have presented the dictionary and the relevant code for this issue. Kindly let me know what the issue is.
import string
import re
import csv

data = {
    "Pakistan": (0.57, 0.05, 0.79),
    "India": (0.47, 0.12, 0.54),
    "Bangladesh": (0.49, 0.17, 0.81)
}

csv_columns = ['Country Name','1997','1998','1999','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010']
csv_file = "Emissions_subset.csv"

con_name = input("Write up to three comma-separated countries for which you want to extract data: ")
count = len(re.findall(r'\w+', con_name))

if count == 1:
    con_check1 = con_name.split()[0]
    if con_check1.lower() in map(str.lower, data.keys()):
        with open(csv_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerow(list(data.keys()).index(con_check1))
        print("Data successfully extracted for countries {} saved into file Emissions_subset.csv".format(con_check1.capitalize()))

Unable to retrieve data from frame

I am trying to retrieve specific data from a data frame with a particular condition, but it shows an empty data frame. I am new to data science and still learning. Here is my code.
file = open('/home/jeet/files1/files/ch03/adult.data', 'r')

def chr_int(a):
    if a.isdigit(): return int(a)
    else: return 0

data = []
for line in file:
    data1 = line.split(',')
    if len(data1) == 15:
        data.append([chr_int(data1[0]), data1[1],
                     chr_int(data1[2]), data1[3],
                     chr_int(data1[4]), data1[5],
                     data1[6], data1[7], data1[8],
                     data1[9], chr_int(data1[10]),
                     chr_int(data1[11]),
                     chr_int(data1[12]),
                     data1[13], data1[14]])

import pandas as pd
df = pd.DataFrame(data)
df.columns = ['age', 'type-employer', 'fnlwgt', 'education', 'education_num', 'marital', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hr_per_week', 'country', 'income']

ml = df[(df.sex == 'Male')]   # here I retrieve the male rows
ml1 = df[(df.sex == 'Male') & (df.income == '>50K\n')]
print(ml1.head())             # here I print that data
fm = df[(df.sex == 'Female')]
fm1 = df[(df.sex == 'Female') & (df.income == '>50K\n')]
output:
Empty DataFrame
Columns: [age, type-employer, fnlwgt, education, education_num, marital, occupation, relationship, race, sex, capital_gain, capital_loss, hr_per_week, country, income]
Index: []
What's wrong with the code? Why is the data frame empty?
If you check the values carefully, you may see the problem:
print(df.income.unique())
>>> [' <=50K\n' ' >50K\n']
There are spaces in front of the values. So the values should either be processed to get rid of these spaces, or the code should be modified like this:
ml1 = df[(df.sex == 'Male') & (df.income == ' >50K\n')]
fm1 = df [(df.sex == 'Female') & (df.income ==' <=50K\n')]
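Alternatively, a small sketch of the first option, assuming the usual adult.data layout where every field after a comma carries a leading space and the last field keeps its trailing newline; note the sex column has the same leading space, so df.sex == 'Male' benefits from this as well:

# Strip leading/trailing whitespace (and the trailing newline on income)
# from every string column, then compare against clean values.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

ml1 = df[(df.sex == 'Male') & (df.income == '>50K')]
fm1 = df[(df.sex == 'Female') & (df.income == '>50K')]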

Assigning multiple values to dictionary keys from a file in Python 3

I'm fairly new to Python but I haven't found the answer to this particular problem.
I am writing a simple recommendation program and I need to have a dictionary where cuisine is a key and name of a restaurant is a value. There are a few instances where I have to split a string of a few cuisine names and make sure all other restaurants (values) which have the same cuisine get assigned to the same cuisine (key). Here's a part of a file:
Georgie Porgie
87%
$$$
Canadian, Pub Food
Queen St. Cafe
82%
$
Malaysian, Thai
Mexican Grill
85%
$$
Mexican
Deep Fried Everything
52%
$
Pub Food
So it's just the first and the last one that share a cuisine here, but there are more later in the file.
And here is my code:
def new(file):
    file = "/.../Restaurants.txt"
    d = {}
    key = []
    with open(file) as file:
        lines = file.readlines()
        for i in range(len(lines)):
            if i % 5 == 0:
                if "," not in lines[i + 3]:
                    d[lines[i + 3].strip()] = [lines[i].strip()]
                else:
                    key += (lines[i + 3].strip().split(', '))
                    for j in key:
                        if j not in d:
                            d[j] = [lines[i].strip()]
                        else:
                            d[j].append(lines[i].strip())
    return d
It gets all the keys and values printed but it doesn't assign two values to the same key where it should. Also, with this last 'else' statement, the second restaurant is assigned to the wrong key as a second value. This should not happen. I would appreciate any comments or help.
In the case where there is only one category, you don't check whether the key is already in the dictionary. You should handle it analogously to the multiple-category case, and then it works fine.
I don't know why you have file as an argument when you then overwrite it inside the function.
Additionally, you should build key fresh for each record rather than using += (which keeps adding to the existing key).
When you check whether j is in the dictionary, a clean way is to check against the keys (d.keys()).
def new(file):
    file = "/.../Restaurants.txt"
    d = {}
    key = []
    with open(file) as file:
        lines = file.readlines()
        for i in range(len(lines)):
            if i % 5 == 0:
                if "," not in lines[i + 3]:
                    # single cuisine: check whether the key already exists
                    if lines[i + 3].strip() not in d.keys():
                        d[lines[i + 3].strip()] = [lines[i].strip()]
                    else:
                        d[lines[i + 3].strip()].append(lines[i].strip())
                else:
                    # multiple cuisines: build key fresh for this record
                    key = (lines[i + 3].strip().split(', '))
                    for j in key:
                        if j not in d.keys():
                            d[j] = [lines[i].strip()]
                        else:
                            d[j].append(lines[i].strip())
    return d
Normally, I find that if you use names for the dictionary keys, you may have an easier time handling them later.
In the example below, I return a series of dictionaries, one for each restaurant. I also wrap the functionality of processing the values in a method called add_value(), to keep the code more readable.
In my example, I'm using codecs to decode the value. Although not necessary, depending on the characters you are dealing with it may be useful. I'm also using itertools to read the file lines with an iterator. Again, not necessary depending on the case, but might be useful if you are dealing with really big files.
import copy, itertools, codecs

class RestaurantListParser(object):
    file_name = "restaurants.txt"
    base_item = {
        "_type": "undefined",
        "_fields": {
            "name": "undefined",
            "nationality": "undefined",
            "rating": "undefined",
            "pricing": "undefined",
        }
    }

    def add_value(self, formatted_item, field_name, field_value):
        if isinstance(field_value, str):
            # handle encoding, strip, process the values as you need.
            field_value = codecs.encode(field_value, 'utf-8').decode('utf-8').strip()
            formatted_item["_fields"][field_name] = field_value
        else:
            print('Error parsing field "%s", with value: %s' % (field_name, field_value))

    def generator(self, file_name):
        with open(file_name) as file:
            while True:
                lines = tuple(itertools.islice(file, 5))
                if not lines:
                    break
                # Initialize our dictionary for this item
                formatted_item = copy.deepcopy(self.base_item)
                if "," not in lines[3]:
                    formatted_item['_type'] = lines[3].strip()
                else:
                    formatted_item['_type'] = lines[3].split(',')[1].strip()
                    self.add_value(formatted_item, 'nationality', lines[3].split(',')[0])
                self.add_value(formatted_item, 'name', lines[0])
                self.add_value(formatted_item, 'rating', lines[1])
                self.add_value(formatted_item, 'pricing', lines[2])
                yield formatted_item

    def split_by_type(self):
        d = {}
        for restaurant in self.generator(self.file_name):
            if restaurant['_type'] not in d:
                d[restaurant['_type']] = [restaurant['_fields']]
            else:
                d[restaurant['_type']] += [restaurant['_fields']]
        return d
Then, if you run:
p = RestaurantListParser()
print(p.split_by_type())
You should get:
{
    'Mexican': [{
        'name': 'Mexican Grill',
        'nationality': 'undefined',
        'pricing': '$$',
        'rating': '85%'
    }],
    'Pub Food': [{
        'name': 'Georgie Porgie',
        'nationality': 'Canadian',
        'pricing': '$$$',
        'rating': '87%'
    }, {
        'name': 'Deep Fried Everything',
        'nationality': 'undefined',
        'pricing': '$',
        'rating': '52%'
    }],
    'Thai': [{
        'name': 'Queen St. Cafe',
        'nationality': 'Malaysian',
        'pricing': '$',
        'rating': '82%'
    }]
}
Your solution is simple, so it's ok. I'd just like to mention a couple of ideas that come to mind when I think about this kind of problem.
Here's another take, using defaultdict and split to simplify things.
from collections import defaultdict

record_keys = ['name', 'rating', 'price', 'cuisine']

def load(file):
    with open(file) as file:
        data = file.read()
    restaurants = []
    # chop up input on each blank line (2 newlines in a row)
    for record in data.split("\n\n"):
        fields = record.split("\n")
        # build a dictionary by zipping together the fixed set
        # of field names and the values from this particular record
        restaurant = dict(zip(record_keys, fields))
        # split chops apart the cuisine types on commas, then _.strip()
        # removes any leading/trailing whitespace on each type of cuisine
        restaurant['cuisine'] = [_.strip() for _ in restaurant['cuisine'].split(",")]
        restaurants.append(restaurant)
    return restaurants

def build_index(database, key, value):
    index = defaultdict(set)
    for record in database:
        for v in record.get(key, []):
            # defaultdict creates an empty set the first time a cuisine
            # is seen, then the restaurant name is added to it
            index[v].add(record[value])
    return index

restaurant_db = load('/var/tmp/r')
print(restaurant_db)
by_type = build_index(restaurant_db, 'cuisine', 'name')
print(by_type)
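For the four sample restaurants in the question, by_type comes out to something like this (set ordering will vary between runs):

defaultdict(<class 'set'>, {
    'Canadian': {'Georgie Porgie'},
    'Pub Food': {'Georgie Porgie', 'Deep Fried Everything'},
    'Malaysian': {'Queen St. Cafe'},
    'Thai': {'Queen St. Cafe'},
    'Mexican': {'Mexican Grill'}
})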
