Can't get rid of blank rows in csv output - python-3.x

I've written a very tiny script in Python Scrapy to parse the name, street, and phone number displayed across multiple pages of the Yellow Pages website. When I run the script it works smoothly. The only problem I encounter is how the data end up in the CSV output: there is always a blank row between two rows of data, i.e. the data are printed on every other row. If it were not for Scrapy, I could have opened the file with newline=''. Unfortunately, I'm totally stuck here. How can I get rid of the blank lines in the CSV output? Thanks in advance for taking a look.
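For context, this is the plain csv-module workaround I mean (a minimal sketch with made-up sample rows, not my Scrapy code):

import csv

# Sample data just to illustrate the newline='' fix on Windows
rows = [{'name': 'Pizza Place', 'street': '1 Main St', 'phone': '555-0100'}]
with open('items.csv', 'w', newline='') as f:  # newline='' prevents the extra blank rows
    writer = csv.DictWriter(f, fieldnames=['name', 'street', 'phone'])
    writer.writeheader()
    writer.writerows(rows)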
items.py includes:
import scrapy

class YellowpageItem(scrapy.Item):
    name = scrapy.Field()
    street = scrapy.Field()
    phone = scrapy.Field()
Here is the spider:
import scrapy

class YellowpageSpider(scrapy.Spider):
    name = "YellowpageSp"
    start_urls = ["https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=Los%20Angeles%2C%20CA&page={0}".format(page) for page in range(2, 6)]

    def parse(self, response):
        for titles in response.css('div.info'):
            name = titles.css('a.business-name span[itemprop=name]::text').extract_first()
            street = titles.css('span.street-address::text').extract_first()
            phone = titles.css('div[itemprop=telephone]::text').extract_first()
            yield {'name': name, 'street': street, 'phone': phone}
In the CSV output, every data row is followed by a blank row.
Btw, the command I'm using to get csv output is:
scrapy crawl YellowpageSp -o items.csv -t csv

You can fix it by creating a custom feed exporter. Change your settings.py as below:
FEED_EXPORTERS = {
    'csv': 'project.exporters.FixLineCsvItemExporter',
}
Create an exporters.py in your project:
exporters.py
import io
import os
import six
import csv

from scrapy.contrib.exporter import CsvItemExporter  # on newer Scrapy versions: from scrapy.exporters import CsvItemExporter
from scrapy.extensions.feedexport import IFeedStorage
from w3lib.url import file_uri_to_path
from zope.interface import implementer


@implementer(IFeedStorage)
class FixedFileFeedStorage(object):

    def __init__(self, uri):
        self.path = file_uri_to_path(uri)

    def open(self, spider):
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        # open in binary append mode so the stream can be re-wrapped below
        return open(self.path, 'ab')

    def store(self, file):
        file.close()


class FixLineCsvItemExporter(CsvItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        super(FixLineCsvItemExporter, self).__init__(file, include_headers_line, join_multivalued, **kwargs)
        self._configure(kwargs, dont_fail=True)
        # close the stream Scrapy opened and reopen it with newline="" so the
        # csv writer does not emit the extra blank lines on Windows
        self.stream.close()
        storage = FixedFileFeedStorage(file.name)
        file = storage.open(file.name)
        self.stream = io.TextIOWrapper(
            file,
            line_buffering=False,
            write_through=True,
            encoding=self.encoding,
            newline="",
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
I am on Mac, so I can't test the Windows behavior. If the above doesn't work, change the part of the code below and set newline="\n":
self.stream = io.TextIOWrapper(
    file,
    line_buffering=False,
    write_through=True,
    encoding=self.encoding,
    newline="\n",
) if six.PY3 else file

Related

Scrapy script not scraping items

I'm not sure why my script isn't scraping any items. It's the same script I'm using for another website, so maybe the classes I'm using are wrong.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime

date = datetime.now().strftime("%d_%m_%Y")


class stiendaSpider(scrapy.Spider):
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        for products in response.css('.grp778'):
            price = products.css('.precioSantander::text').get()
            name = products.css('#catalogoProductos .tit::text').get()
            if price and name:
                yield {'name': name.strip(),
                       'price': price.strip()}


os.chdir('C:\\Users\\cabre\\Desktop\\scraping\\stienda\\data\\raw')

process = CrawlerProcess(
    # settings={"FEEDS": {"items.csv": {"format": "csv"}}}
    settings={"FEEDS": {f"stienda_{date}.csv": {"format": "csv"}}}
)
process.crawl(stiendaSpider)
process.start()
I tried several things but I don't understand why it is not working.
I was able to get the name field, but the price element is rendered empty and filled in later by an AJAX call. That is why it's not being extracted.
import scrapy
import os
from scrapy.crawler import CrawlerProcess
from datetime import datetime

date = datetime.now().strftime("%d_%m_%Y")


class stiendaSpider(scrapy.Spider):
    name = 'stienda'
    start_urls = ['https://stienda.uy/tv']

    def parse(self, response):
        # select the product containers via their data-disp attribute
        for products in response.xpath('//div[@data-disp="1"]'):
            name = products.css('.tit::text').get()
            if name:
                yield {'name': name.strip()}
You can see it if you look at the page source... all of the elements with the class 'precioSantander' are empty.
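A quick way to confirm this from the raw HTML (a sketch, assuming requests and parsel are installed; the class name comes from the question):

import requests
from parsel import Selector

html = requests.get('https://stienda.uy/tv').text
sel = Selector(text=html)

print(len(sel.css('.precioSantander')))            # the elements are present in the markup...
print(sel.css('.precioSantander::text').getall())  # ...but have no text until the AJAX call fills them in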

Add image/thumbnail to my Python dataframe

I am working on a project where I need to create a movie database.
I have created my database and imported the links from IMDb that redirect you to each movie's webpage. I would also like to add the main image/thumbnail of each movie so that I can then use the CSV in Power BI.
However, I did not manage to do it:
I have tried this:
import requests
from bs4 import BeautifulSoup
import numpy as np

images = []
for i in df_database_url['Url Film']:
    r = requests.get(i)
    soup = BeautifulSoup(r.content, "html.parser")
    images.append(image_url)
But my goal is to have a column that includes the thumbnail for each movie.
Assuming that i is an IMDb movie URL (the kind that starts with https://www.imdb.com/title), you can target the script tag that contains most of the main information for the movie. You can get the image with:
# import json
image_url = json.loads(soup.select_one('script[type="application/ld+json"]').text)['image']
or, if we're more cautious:
# import json
scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
if scCont:
    try:
        scCont = json.loads(scCont[0])
        if 'image' not in scCont:
            image_url = None
            print('No image found for', i)
        else:
            image_url = scCont['image']
    except Exception as e:
        image_url = None
        print('Could not parse movie info for', i, '\n', str(e))
else:
    image_url = None
    print('Could not find script with movie info for', i)
(and you can get the trailer thumbnail with scCont['trailer']['thumbnailUrl'])
This way, instead of raising an error if anything on the path to the expected info is unavailable, it will just add image_url as None; if you want it to halt and raise error in such cases, use the first version.
and then after the loop you can add in the column with something like
df_database_url['image_urls'] = images
(you probably know that...)
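Putting it together with the loop from the question, a sketch (df_database_url and the 'Url Film' column come from the question; the User-Agent header is just an assumption, since IMDb may block the default one):

import json

import requests
from bs4 import BeautifulSoup

images = []
for i in df_database_url['Url Film']:
    r = requests.get(i, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(r.content, "html.parser")

    image_url = None
    scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
    if scCont:
        try:
            image_url = json.loads(scCont[0]).get('image')
        except Exception as e:
            print('Could not parse movie info for', i, '\n', str(e))
    images.append(image_url)

df_database_url['image_urls'] = images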

How do I read request.FILES into a DataSource in GeoDjango?

So, the goal is to create a webpage where a .shp file can be uploaded, and to return a summary of some calculations as a JsonResponse. I have prepared the calculations and everything works nicely when I add a manual path to the file in question. However, the goal is for someone else to be able to upload the data and get back the response, so I can't hardcode my path.
The overall approach:
Read the file in through forms.FileField() and request.FILES['file_name']. After this, I need to pass this request.FILES object to DataSource in order to read it. I would rather not save the file to disk if possible, but work directly from memory.
forms.py
from django import forms
from django.core.files.storage import FileSystemStorage


class UploadFileForm(forms.Form):
    # title = forms.CharField(max_length=50)
    file = forms.FileField()
views.py
import json
import os

from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.shortcuts import render
from django.template import loader
from django.contrib import messages
from django.views.generic import TemplateView
from django.http import JsonResponse
from django.conf import settings
from .forms import UploadFileForm
from . import models
from django.shortcuts import redirect
from gisapp.functions.functions import handle_uploaded_file, handle_uploaded_file_two
from django.contrib.gis.gdal import DataSource
from django.core.files.uploadedfile import UploadedFile, TemporaryUploadedFile
import geopandas as gpd
import fiona


def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            f = request.FILES['file']
            # geo2 = gpd.read_file(f)
            # print(geo2)
            # f_path = os.path.abspath(os.path.join(os.path.dirname(f), f))
            # f_path = TemporaryUploadedFile.temporary_file_path(UploadedFile(f))
            # print(f_path)
            # f_path = f.temporary_file_path()
            # new_path = request.FILES['file'].temporary_file_path
            # print(f'This is file path: {f_path}')
            # print(f'This is file path: {new_path}')
            # data = DataSource(f'gisapp/data/{f}')  # given an absolute path it works great
            data = DataSource(f)  # constantly failing
            # data = DataSource(new_path)
            # print(f'This is file path: {f_path}')
            layer = data[0]
            if layer.geom_type.name == "Polygon" or layer.geom_type.name == "LineString":
                handle_uploaded_file(request.FILES['file'])
            elif layer.geom_type.name == "Point":
                handle_uploaded_file_two(request.FILES['file'])
            return JsonResponse({"Count": f"{handle_uploaded_file_two(request.FILES['file'])[0]}", "Bounding Box": f"{handle_uploaded_file_two(request.FILES['file'])[1]}"})
            # return JsonResponse({"Count": f"{handle_uploaded_file(request.FILES['file'])[0]}", "Minimum": f"{handle_uploaded_file(request.FILES['file'])[1]}", "Maximum": f"{handle_uploaded_file(request.FILES['file'])[1]}"})
            # instance = models.GeometryUpload(file_field=request.FILES['file'])
            # instance.save()
            # # return HttpResponseRedirect('/success/')
    else:
        form = UploadFileForm()
    return render(request, 'upload.html', {'form': form})
Error I get:
django.contrib.gis.gdal.error.GDALException: Invalid data source input type: <class 'django.core.files.uploadedfile.InMemoryUploadedFile'>
As you can see from upload_file() in views.py, I tried a multitude of approaches. When I pass an absolute path it works, but otherwise I can't seem to get the uploaded file into DataSource so that I can use it in my later analysis.
Looking at how Django handles this, it doesn't appear possible to work off of an in memory file. The path to the file is passed to the C API for OGR which then handles opening the file and reading it in.
A possible solution that I am trying myself is to have the user zip their shapefile components (.shp, .shx, .dbf, etc.) beforehand. The zip file is then uploaded and unzipped, and the .shp files can then be read. Hope this helps.
I faced the same problem, and my workaround was to save the file uploaded by the user to a temporary folder, then pass the absolute path of the temporary file to my DataSource. After finishing all my processing with the temporary file, I deleted it. The downside of this method is that the execution time is slow. A sketch of this approach follows below.
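A minimal sketch of that temporary-file workaround (the names here are illustrative, not the poster's actual code, and the handling of the .shx/.dbf companion files is left out):

import os
import tempfile

from django.contrib.gis.gdal import DataSource


def datasource_from_upload(uploaded_file):
    """Write an uploaded .shp to a temporary file and open it with DataSource.

    Note: a real shapefile also needs its .shx/.dbf companions saved next to it,
    so in practice you would save all parts (or an unzipped archive) into the
    same temporary directory first.
    """
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        for chunk in uploaded_file.chunks():   # works for both in-memory and temporary uploads
            tmp.write(chunk)
        tmp_path = tmp.name
    try:
        ds = DataSource(tmp_path)              # DataSource wants a filesystem path
        layer = ds[0]
        return layer.geom_type.name, layer.num_feat
    finally:
        os.remove(tmp_path)                    # clean up the temporary copy

In the view you would call this with request.FILES['file'] after form.is_valid().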

Unit test case for file upload in Flask

I have created a Flask application where I upload a file and then predict the type of the file. I want to write a unit test case for this. I am new to unit testing in Python and therefore very confused! There are two parts to my code: the first is the main function, which then calls the classification method.
main.py: here the file is uploaded and then we call the func_predict function, which returns the output.
upload_parser = api.parser()
upload_parser.add_argument('file', location='files',
                           type=FileStorage, required=True)


@api.route('/classification')
@api.expect(upload_parser)
class classification(Resource):

    def post(self):
        """
        predict the document
        """
        args = upload_parser.parse_args()
        uploaded_file = args['file']
        filename = uploaded_file.filename
        prediction, confidence = func_predict(uploaded_file)
        return {'file_name': filename, 'prediction': prediction, 'confidence': confidence}, 201
predict.py: this file contains the func_predict function, which does the actual prediction work. It takes the uploaded file as input.
def func_predict(file):
    filename = file.filename                                # filename
    extension = os.path.splitext(filename)[1][1:].lower()   # file_extension
    path = os.path.join(UPLOAD_FOLDER, filename)            # store the temporary path of the file
    output = {}
    try:
        # Does some processing.... some lines which are not relevant and then returns the two values
        return (''.join(y_pred), max_prob)
Now my confusion is: how do I mock the uploaded file, which is of FileStorage type? Also, which part should I test: the '/classification' endpoint or func_predict?
I have tried the method below, though without any success. I created a test.py file, imported the classification class from main.py, and then passed a file name in the data:
from flask import Flask, Request
import io
import unittest

from main import classification


class TestFileFail(unittest.TestCase):

    def test_1(self):
        app = Flask(__name__)
        app.debug = True
        app.request_class = MyRequest
        client = app.test_client()

        resp = client.post(
            '/classification',
            data={
                'file': 'C:\\Users\\aswathi.nambiar\\Desktop\\Desktop docs\\W8_ECI_1.pdf'
            }, content_type='multipart/form-data'
        )
        print(resp.data)

        self.assertEqual(
            'ok',
            resp.data,
        )


if __name__ == '__main__':
    unittest.main()
I am completely lost! I know there have been earlier questions, but I am not able to figure it out.
I have finally stumbled upon how to test it, in case anybody is looking for something similar.
from predict_main_restplus import func_predict
from werkzeug.datastructures import FileStorage

file = None


def test_classification_correct():
    with open('W8-EXP_1.pdf', 'rb') as fp:
        file = FileStorage(fp)
        a, b = func_predict(file)
        assert (a, b) == ('W-8EXP', 90.15652760121652)
So, here we are testing the prediction function in predict.py, it returns two values, prediction result and the confidence of the prediction. We can mock the upload using open(file) and then wrapping it with FileStorage. This worked for me.
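If you also want to exercise the '/classification' endpoint itself rather than func_predict, here is a sketch using Flask's test client with an in-memory file (the app import and the dummy bytes are assumptions based on the question; a real test would use the bytes of an actual PDF):

import io

from main import app  # assumption: the Flask app object lives in main.py


def test_classification_endpoint():
    client = app.test_client()
    # Werkzeug turns a (stream, filename) tuple into a proper multipart file upload
    data = {'file': (io.BytesIO(b'dummy pdf bytes'), 'W8_ECI_1.pdf')}
    resp = client.post('/classification', data=data, content_type='multipart/form-data')
    assert resp.status_code == 201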

How to display proper output when using re.findall() in Python?

I made a Python script to get the latest stock price from Yahoo Finance.
import urllib.request
import re
htmlfile = urllib.request.urlopen("http://finance.yahoo.com/q?s=GOOG");
htmltext = htmlfile.read();
price = re.findall(b'<span id="yfs_l84_goog">(.+?)</span>',htmltext);
print(price);
It works smoothly, but when I output the price it comes out like this: [b'1,217.04']
This might be a petty issue to ask, but I'm new to Python scripting, so please help me if you can.
I want to get rid of the b. If I remove the b from b'<span id="yfs_l84_goog">' then it shows this error:
File "C:\Python33\lib\re.py", line 201, in findall
return _compile(pattern, flags).findall(string)
TypeError: can't use a string pattern on a bytes-like object
I want the output to be just
1,217.04
b'' is a syntax for bytes literals in Python. It is how you could define byte sequences in Python source code.
What you see in the output is the representation of the single bytes object inside the price list returned by re.findall(). You could decode it into a string and print it:
>>> for item in price:
...     print(item.decode())  # assume utf-8
...
1,217.04
You could also write bytes directly to stdout e.g., sys.stdout.buffer.write(price[0]).
You could use an html parser instead of a regex to parse html:
#!/usr/bin/env python3
import cgi
from html.parser import HTMLParser
from urllib.request import urlopen

url = 'http://finance.yahoo.com/q?s=GOOG'


def is_price_tag(tag, attrs):
    return tag == 'span' and dict(attrs).get('id') == 'yfs_l84_goog'


class Parser(HTMLParser):
    """Extract tag's text content from html."""

    def __init__(self, html, starttag_callback):
        HTMLParser.__init__(self)
        self.contents = []
        self.intag = None
        self.starttag_callback = starttag_callback
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        self.intag = self.starttag_callback(tag, attrs)

    def handle_endtag(self, tag):
        self.intag = False

    def handle_data(self, data):
        if self.intag:
            self.contents.append(data)


# download and convert to Unicode
response = urlopen(url)
_, params = cgi.parse_header(response.headers.get('Content-Type', ''))
html = response.read().decode(params['charset'])

# parse html (extract text from the price tag)
content = Parser(html, is_price_tag).contents[0]
print(content)
Also check whether Yahoo provides an API that doesn't require web scraping.
Okay, after searching for a while I found the solution. It works fine for me.
import urllib.request
import re
htmlfile = urllib.request.urlopen("http://finance.yahoo.com/q?s=GOOG");
htmltext = htmlfile.read();
pattern = re.compile('<span id="yfs_l84_goog">(.+?)</span>');
price = pattern.findall(str(htmltext));
print(price);
