flask server unresponsive after second load - python-3.x

After I implemented caching on my Flask server, everything works perfectly on localhost: the first request takes 8000 ms, the second 26 ms, so the cache is clearly working.
When I deployed the application to an AWS EC2 instance, the first request takes 21000 ms, and whenever I try to run it again the server stops responding.
This is the code:
#!flask/bin/python
from flask import Flask, jsonify
from flask import request
from flask_caching import Cache
import json
import nltk, string
import operator
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import time
import access_json

app = Flask(__name__)
cache = Cache(app, config={'CACHE_TYPE': 'simple'})

with open('JSON files/thesaurus.json', 'r') as fp:
    thesaurus_dict = json.load(fp)

with open('JSON files/JOBS.json', 'r') as f:
    json_list = json.load(f)

def output(word_list):
    # filter_toplist is built elsewhere; only the relevant parts are shown here
    return filter_toplist

@app.route('/postjson', methods=['POST'])
@cache.cached(timeout=20)
def json_handler():
    content = request.get_json(force=True)
    word_list = access_json.read_parsed_JSON(content)
    return jsonify({'jobs': output(word_list)})

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0')
This is not all of the code, only the part relevant to the Flask server.
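As a side note (not from the original post), here is a minimal client-side sketch for checking whether the cache is actually being hit on the EC2 instance, assuming the app above listens on port 5000 and the requests library is installed; the payload shape is a placeholder, since access_json.read_parsed_JSON defines what the endpoint really expects. If @cache.cached is working, the second POST should come back in milliseconds.

import time
import requests

URL = 'http://<ec2-host>:5000/postjson'  # placeholder host; use localhost when testing locally
payload = {'text': 'sample input'}       # placeholder; match whatever access_json expects

for attempt in (1, 2):
    start = time.time()
    response = requests.post(URL, json=payload, timeout=120)
    elapsed_ms = (time.time() - start) * 1000
    print('attempt %d: HTTP %d in %.0f ms' % (attempt, response.status_code, elapsed_ms))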

Related

cannot import name python 3.10

Hi, I have created a main page in Python with the following code:
from website import create_app

app = create_app()

if __name__ == '__main__':
    app.run(debug=True)
The second file contains:
from flask import Flask

def create_app():
    app = Flask(__name__)
    app.config['SECRET_KEY'] = 'computer1'
    return app
I entered from website import create_app and I am getting an error message which states:
cannot import name 'create_app'
python\python310\lib\site-packages\website\__init__.py
When I press run, the above comes up. Can someone please advise?
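For reference, a sketch of the layout that import assumes (hypothetical paths, not taken from the post): create_app must be defined, with exactly that spelling, in website/__init__.py, or re-exported there, and the main script imports it without quotes. If create_app actually lives in another module such as website/views.py, add from .views import create_app to __init__.py.

# Assumed layout (hypothetical):
#   project/
#       main.py
#       website/
#           __init__.py   <- must define (or re-export) create_app

# website/__init__.py
from flask import Flask

def create_app():
    app = Flask(__name__)
    app.config['SECRET_KEY'] = 'computer1'
    return app

# main.py
from website import create_app  # no quotes; the name must match exactly

app = create_app()

if __name__ == '__main__':
    app.run(debug=True)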

Cannot run a spider successfully after finishing scraping data by another spider through running a script

I am following code from these previous Stack Overflow posts:
How to schedule Scrapy crawl execution programmatically
Running Scrapy multiple times in the same process
The following script works well while using one spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
    PList = yaml.load(file, Loader=yaml.FullLoader)

Purl = []
for k, v in PList.items():
    arg = v['M_title']
    args = {"k": arg}
    amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
    Purl.append(amazon_url)

print(Purl)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())

@inlineCallbacks
def loop_urls(urls):
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()

loop_urls(Purl)
reactor.run()
But this script doesn't even scrape successfully with the first spider, and can't reach the second spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

# def crawl_job():
#     """
#     Job to start spiders.
#     Return Deferred, which will execute after crawl has completed.
#     """
#     settings = get_project_settings()
#     runner = CrawlerRunner(settings)
#     return runner.crawl(AmazonfeedSpider)

def CrawlProduct():
    settings = get_project_settings()
    runner2 = CrawlerRunner(settings)
    yield runner2.crawl(ProductfeedSpider)
    reactor.stop()

def schedule_next_crawl(null, sleep_time):
    """
    Schedule the next crawl
    """
    reactor.callLater(sleep_time, CrawlProduct)

@inlineCallbacks
def loop_urls(urls):
    """
    Job to start spiders.
    Return Deferred, which will execute after crawl has completed.
    """
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()

def crawl(Purl):
    """
    A function that schedules a crawl 30 seconds after
    each successful crawl.
    """
    # loop_urls() returns a Deferred
    d = loop_urls(Purl)
    # call schedule_next_crawl(<scrapy response>, n) after crawl job is complete
    d.addCallback(schedule_next_crawl, 30)
    d.addErrback(catch_error)

def catch_error(failure):
    print(failure.value)

if __name__ == "__main__":
    with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
        PList = yaml.load(file, Loader=yaml.FullLoader)
    Purl = []
    for k, v in PList.items():
        arg = v['M_title']
        args = {"k": arg}
        amazon_url = 'https://www.amazon.com/s?{}'.format(urlencode(args))
        Purl.append(amazon_url)
    print(Purl)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawl(Purl)
    reactor.run()
Is it because the inlineCallbacks function is not executed properly? I would appreciate experts' suggestions and solutions; please look through the aforementioned Stack Overflow questions and answers before answering mine.
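Not an excerpt from the post: a minimal sketch of one way to chain the two spiders, assuming the same project layout and spider names as above. The differences from the failing script are that the chaining function is decorated with @inlineCallbacks (so its yields actually drive the crawls) and that the reactor is stopped only after the second spider finishes.

from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from AmazonScrap.spiders.Productfeed import ProductfeedSpider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())

@inlineCallbacks
def run_all(urls):
    # First spider: crawl each Amazon search URL in turn.
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # Second spider: starts only after the loop above has finished.
    yield runner.crawl(ProductfeedSpider)
    reactor.stop()

if __name__ == '__main__':
    urls = ['https://www.amazon.com/s?k=example']  # placeholder; build this list from the YAML file as before
    run_all(urls)
    reactor.run()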

ValueError: Must be a coordinate pair or Point

I want to pass my latitude and longitude values to my Flask route, but every time I get this error: ValueError: Must be a coordinate pair or Point
However, I have tried this and it works fine:
from flask import Flask, render_template
from geopy.geocoders import Nominatim

app = Flask(__name__)
geolocator = Nominatim()

@app.route('/location')
def lang_and_lat():
    location = geolocator.reverse("21.0943, 81.0337")
    address = location.address
    return render_template('ip.html', address=address)

if __name__ == '__main__':
    app.run(debug=True)

This, however, is the version that raises the error:

from flask import Flask, render_template
from geopy.geocoders import Nominatim

app = Flask(__name__)
geolocator = Nominatim()

@app.route('/location/<lat>/<lang>')
def lang_and_lat(lat, lang):
    location = geolocator.reverse(lat, lang)
    address = location.address
    return render_template('ip.html', address=address)

if __name__ == '__main__':
    app.run(debug=True)
You need to do
location = geolocator.reverse(f'{lat}, {lang}')
or
location = geolocator.reverse(Point(lat, lang))
In the second case you also need from geopy.point import Point. The reason the original call fails is that reverse() takes a single query argument (a "lat, lng" string or a Point); passing lat and lang as two positional arguments makes lang land in the next parameter (exactly_one), so the query is just lat, which is not a valid coordinate pair.
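For completeness, a minimal sketch of the corrected route (not from the original answer). The user_agent argument is an assumption, since recent geopy releases require one for Nominatim, and the path values are cast to float before building the Point.

from flask import Flask, render_template
from geopy.geocoders import Nominatim
from geopy.point import Point

app = Flask(__name__)
# Recent geopy versions require an explicit user_agent; the name here is a placeholder.
geolocator = Nominatim(user_agent="my-flask-app")

@app.route('/location/<lat>/<lang>')
def lang_and_lat(lat, lang):
    # reverse() expects a single query: a "lat, lng" string or a Point.
    location = geolocator.reverse(Point(float(lat), float(lang)))
    address = location.address
    return render_template('ip.html', address=address)

if __name__ == '__main__':
    app.run(debug=True)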

Why is multiprocessing not working with python dash framework - Python3.6

I'm trying to use the multiprocessing library to split a dataframe into parts, process them on multiple CPU cores, and then concatenate the results back into a final dataframe inside a Python Dash application. The code works fine when I run it standalone, outside of the Dash application, but when I enclose the same code in a Dash callback I get an error. I have shown the code below.
I have already tried the multiprocessing code outside of the Dash framework and it works absolutely fine.
import dash
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import flask
import dash_table_experiments as dt
import dash_table
import dash.dependencies
import base64
import time
import os
import pandas as pd
from docx import *
from docx.text.paragraph import Paragraph
from docx.text.paragraph import Run
import xml.etree.ElementTree as ET
import multiprocessing as mp
from multiprocessing import Pool
from docx.document import Document as doctwo
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import io
import csv
import codecs
import numpy as np

app = dash.Dash(__name__)
application = app.server
app.config.supress_callback_exceptions = True

app.layout = html.Div(children=[
    html.Div([
        html.Div([
            html.H4(children='Reader'),
            html.Br(),
        ], style={'text-align': 'center'}),
        html.Br(),
        html.Br(),
        html.Div([
            dcc.Upload(html.Button('Upload File'), id='upload-data', style=dict(display='inline-block')),
            html.Br(),
        ]),
        html.Div(id='output-data-upload'),
    ])
])

@app.callback(Output('output-data-upload', 'children'),
              [Input('upload-data', 'contents')],
              [State('upload-data', 'filename')])
def update_output(contents, filename):
    if contents is not None:
        content_type, content_string = contents.split(',')
        decoded = base64.b64decode(content_string)
        document = Document(io.BytesIO(decoded))
        combined_df = pd.read_csv('combined_df.csv')

        def calc_tfidf(input1):
            input1 = input1.reset_index(drop=True)
            input1['samplecol'] = 'sample'
            return input1

        num_cores = mp.cpu_count() - 1       # number of cores on your machine
        num_partitions = mp.cpu_count() - 1  # number of partitions to split dataframe
        df_split = np.array_split(combined_df, num_partitions)
        pool = Pool(num_cores)
        df = pd.concat(pool.map(calc_tfidf, df_split))
        pool.close()
        pool.join()
        return len(combined_df)
    else:
        return 'No File uploaded'

app.css.append_css({'external_url': 'https://codepen.io/plotly/pen/EQZeaW.css'})

if __name__ == '__main__':
    app.run_server(debug=True)
The above Dash application takes any file as input. Upon uploading a file in the front end, a local CSV file (in my case combined_df.csv) is loaded into a dataframe. Now I want to split that dataframe into parts using multiprocessing, process them, and combine the results back. But the above code results in the following error:
AttributeError: Can't pickle local object 'update_output.<locals>.calc_tfidf'
What's wrong with this piece of code?
Okay, I've figured it out now! The problem is that the function calc_tfidf was not defined as a global (module-level) function. I changed it to a global function and it worked perfectly.
Simple checks, when overlooked, can sometimes lead to days of redundant effort! :(
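A minimal sketch of the fix described above (not the poster's exact code): calc_tfidf is moved to module level so multiprocessing can pickle a reference to it, and the Dash callback only calls the helper.

import multiprocessing as mp
from multiprocessing import Pool

import numpy as np
import pandas as pd

def calc_tfidf(input1):
    # Same body as before, just no longer nested inside the callback.
    input1 = input1.reset_index(drop=True)
    input1['samplecol'] = 'sample'
    return input1

def process_in_parallel(combined_df):
    num_cores = mp.cpu_count() - 1
    df_split = np.array_split(combined_df, num_cores)
    with Pool(num_cores) as pool:
        result = pd.concat(pool.map(calc_tfidf, df_split))
    return result

# Inside update_output, only the call remains:
#     df = process_in_parallel(combined_df)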

Using Twisted + Cyclone + PyPy to handle POST requests cause a memory leak?

After a lot of investigating, I found out that after serving hundreds of thousands of HTTP POST requests, there's a memory leak. The strange part is that the memory leak only occurs when using PyPy.
Here's an example code:
from twisted.internet import reactor
import tornado.ioloop

do_tornado = False
port = 8888

if do_tornado:
    from tornado.web import RequestHandler, Application
else:
    from cyclone.web import RequestHandler, Application

class MainHandler(RequestHandler):
    def get(self):
        self.write("Hello, world")

    def post(self):
        self.write("Hello, world")

if __name__ == "__main__":
    routes = [(r"/", MainHandler)]
    application = Application(routes)
    print port
    if do_tornado:
        application.listen(port)
        tornado.ioloop.IOLoop.instance().start()
    else:
        reactor.listenTCP(port, application)
        reactor.run()
Here is the test code I am using to generate requests:
from twisted.internet import reactor, defer
from twisted.internet.task import LoopingCall
from twisted.web.client import Agent, HTTPConnectionPool
from twisted.web.iweb import IBodyProducer
from zope.interface import implements

pool = HTTPConnectionPool(reactor, persistent=True)
pool.retryAutomatically = False
pool.maxPersistentPerHost = 10
agent = Agent(reactor, pool=pool)

bid_url = 'http://localhost:8888'

class StringProducer(object):
    implements(IBodyProducer)

    def __init__(self, body):
        self.body = body
        self.length = len(body)

    def startProducing(self, consumer):
        consumer.write(self.body)
        return defer.succeed(None)

    def pauseProducing(self):
        pass

    def stopProducing(self):
        pass

def callback(a):
    pass

def error_callback(error):
    pass

def loop():
    d = agent.request('POST', bid_url, None, StringProducer("Hello, world"))
    # d = agent.request('GET', bid_url)
    d.addCallback(callback).addErrback(error_callback)

def main():
    exchange = LoopingCall(loop)
    exchange.start(0.02)
    # log.startLogging(sys.stdout)
    reactor.run()

main()
Note that this code does not leak with CPython, nor with Tornado on PyPy. The code leaks only when using Twisted and PyPy together, and only when using a POST request.
To see the leak, you have to send hundreds of thousands of requests.
Note that when setting PYPY_GC_MAX, the process eventually crashes.
What's going on?
It turns out that the cause of the leak is the BytesIO class.
Here's how to simulate the leak on PyPy:
from io import BytesIO
while True: a = BytesIO()
Here's the fix:
https://bitbucket.org/pypy/pypy/commits/40fa4f3a0740e3aac77862fe8a853259c07cb00b
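Not part of the original answer: a small sketch for watching memory while exercising the BytesIO reproduction above, assuming a Unix-like system where resource.getrusage reports ru_maxrss. On an affected PyPy build the reported figure keeps climbing; on CPython it levels off.

import resource
from io import BytesIO

for i in range(10000000):
    a = BytesIO()
    if i % 1000000 == 0:
        # ru_maxrss is in kilobytes on Linux (bytes on macOS).
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print('iteration', i, 'peak RSS:', peak)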
