Parsing Google News via the Tor network - python-3.x

I need to check the news daily for a list of keywords, and I use GNews for this. After a certain number of requests, Google blocks my access to the news.
I want to try using the Tor network for the requests, but I can't get GNews to use Tor. Are there any ready-made solutions? Or can you tell me what changes need to be made to the GNews code? Or maybe there are other options?
I am not a programmer; I am just trying to automate some of my actions.
My code:
import datetime
import psycopg2
from gnews import GNews

c = 0
format = '%a, %d %b %Y %H:%M:%S GMT'
countries_gnews = ['US', 'AU', 'BW', 'CA', 'ET', 'GH', 'IN', 'ID', 'IE', 'IL', 'KE', 'LV', 'MY', 'NA', 'NZ', 'NG', 'PK', 'PH', 'SG', 'ZA', 'TZ', 'UG', 'GB', 'ZW', 'CZ', 'DE', 'AT', 'CH', 'AR', 'CL', 'CO', 'CU', 'MX', 'PE', 'VE', 'BE', 'FR', 'MA', 'SN', 'IT', 'LT', 'HU', 'NL', 'NO', 'PL', 'BR', 'PT', 'RO', 'SK', 'SI', 'SE', 'VN', 'TR', 'GR', 'BG', 'RU', 'UA', 'RS', 'AE', 'SA', 'LB', 'EG', 'BD', 'TH', 'CN', 'TW', 'HK', 'JP', 'KR']
keywords_gnews = [
    'keyword1',
    'keyword2',
    'keyword200'
]
date_start = datetime.date(2022, 10, 1)
date_end = datetime.date(2022, 10, 2)

for country_gnews in countries_gnews:
    print(country_gnews)
    google_news = GNews(language='en', country=country_gnews, start_date=date_start, end_date=date_end)
    print(google_news)
    for keyword_gnews in keywords_gnews:
        json_resps = google_news.get_news(keyword_gnews)
        i = 0
        try:
            connection = psycopg2.connect(user="******",
                                          password="********",
                                          host="127.0.0.1",
                                          port="*****",
                                          database="*******")
            cursor = connection.cursor()
            # Print PostgreSQL connection properties
            print(connection.get_dsn_parameters(), "\n")
            # Print PostgreSQL version
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            print("You are connected to - ", record, "\n")
        except (Exception, psycopg2.Error) as error:
            print("Error while connecting to PostgreSQL", error)
        # print(keyword_gnews, country_gnews)
        for json_resp in json_resps:
            i = i + 1
            title = json_resp['title']
            description = json_resp['description']
            url = json_resp['url']
            published_date = json_resp['published date']
            date_publ = datetime.datetime.strptime(published_date, format)
            publisher = str(json_resp['publisher'])
            source = 'gnews'
            sql = "insert into ***** (title, description, url, date_publ, publisher, country_gnews, keyword_gnews, position, source) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, (title, description, url, date_publ, publisher, country_gnews, keyword_gnews, i, source))
            print('********************')
            connection.commit()
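As a side note, unrelated to the Tor question: the script above opens a new PostgreSQL connection for every keyword and never closes it. A minimal sketch of the same flow with a single connection (credentials kept as the placeholders from the question):

import psycopg2

connection = psycopg2.connect(user="******", password="********",
                              host="127.0.0.1", port="*****",
                              database="*******")
cursor = connection.cursor()
try:
    for country_gnews in countries_gnews:
        for keyword_gnews in keywords_gnews:
            # ... fetch results with GNews and run the insert statements here,
            # exactly as in the script above ...
            pass
    connection.commit()
finally:
    cursor.close()
    connection.close()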
I installed Tor and tried to make changes to gnews.py, but my knowledge was not enough to make the program send its requests through the Tor network. I am on Windows.
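GNews builds on the requests library in the versions I have seen (treat that as an assumption and check your installed copy; newer releases may even accept a proxy argument directly). If so, one way to push its traffic through Tor is to point the standard proxy environment variables at the local Tor SOCKS port before using GNews. This needs the PySocks extra (pip install requests[socks]) and a Tor client listening on its default port 9050:

# Sketch: route requests traffic through a local Tor SOCKS proxy.
# Assumes Tor is running on 127.0.0.1:9050 and requests[socks] is installed.
import os

os.environ['HTTP_PROXY'] = 'socks5h://127.0.0.1:9050'
os.environ['HTTPS_PROXY'] = 'socks5h://127.0.0.1:9050'

from gnews import GNews

google_news = GNews(language='en', country='US')
results = google_news.get_news('keyword1')
print(len(results))

The socks5h scheme makes DNS resolution happen inside Tor as well. To get a new exit IP after being blocked, you would also need to ask Tor for a new circuit (for example via its control port), which is outside this sketch.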

Related

Can't get telnetlib to read when called the second time on an instance

I am trying to write a python script that opens a telnet session and then answers addition problems presented in the session.
from telnetlib import Telnet
from pathlib import Path

host = 'fcd070be5ee67a1a.247ctf.com'
port = 50028
tn = Telnet(host, port)  # Sets the telnet connection
path = Path('log.txt')

def get_line():
    # Grabs everything from the initial print of the telnet terminal and returns it.
    raw_line = tn.read_until(b'?', timeout=5)
    path.write_text(str(raw_line))
    line = str(raw_line.split()).replace('b', '').replace('?', '')
    print(line)
    return line

def get_nums():
    # Cleans up, strips, and sums the numbers pulled from the line in get_line()
    line = get_line()
    ans = int(line.split(' ')[-1].replace('\'', '').replace(']', '')) + int(line.split(' ')[-3].replace('\'', '').replace(',', ''))
    print(ans)
    return ans

def answer():
    # Prints the sum from get_nums() into the telnet terminal and enters
    tn.write(bytes(get_nums()) + b'\n')

answer()
get_line()

# Test to see if calling the instance outside of get_line() would work
raw_line = tn.read_until(b'?', timeout=5)
print(raw_line)
When get_line() is called for the first time (when answer() runs), the code reads from the telnet session and answers a question just fine. But the second time get_line() is called, the raw_line variable that gets written to the test log file only contains the characters b'\r\n', and an empty bracket is printed on my terminal.
Example output when the script is run:
>PS C:\Users\jeffy\Documents\Python\247CTF> py .\telscript.py
>['Welcome', 'to', 'the', '247CTF', 'addition', 'verifier!', 'If', 'you', 'can', 'solve', '500', 'addition', 'prolems,', 'we', 'will', 'give', 'you', 'a', 'flag!', 'What', 'is', 'the', 'answer', 'to', '146', '+', '9']
>155
>[]
>b''
>PS C:\Users\jeffy\Documents\Python\247CTF>
Example of output when answered by hand via telnet terminal:
>Welcome to the 247CTF addition verifier!
>If you can solve 500 addition problems, we will give you a flag!
>What is the answer to 426 + 141?
>567
>Yes, Correct!
>What is the answer to 357 + 263?
I have tried calling the instance both inside and outside of my function; this had no effect.
I have also tried taking every line out of the functions entirely, which only changed the output slightly:
from telnetlib import Telnet
from pathlib import Path

# Sets the telnet connection:
host = 'fcd070be5ee67a1a.247ctf.com'
port = 50028
tn = Telnet(host, port)
path = Path('log.txt')

# Gets and cleans the initial line
raw_line = tn.read_until(b'?', timeout=1)
path.write_text(str(raw_line))
line = str(raw_line.split()).replace('b', '').replace('?', '')
ans = int(line.split(' ')[-1].replace('\'', '').replace(']', '')) + int(line.split(' ')[-3].replace('\'', '').replace(',', ''))
print(line)
print(ans)

# Plugs the answer in to the telnet session
str_ans = str(ans)
tn.write(bytes(str_ans + '\n', 'utf-8'))

# Attempts to get the next line
new_line = tn.read_until(b'?', timeout=5)
print(new_line)
Outputs:
>PS C:\Users\jeffy\Documents\Python\247CTF> py .\test.py
>['Welcome', 'to', 'the', '247CTF', 'addition', 'verifier!', 'If', 'you', 'can', 'solve', '500', 'addition', 'prolems,', 'we', 'will', 'give', 'you', 'a', 'flag!', 'What', 'is', 'the', 'answer', 'to', '170', '+', '486']
>656
>b'\r\n'
>PS C:\Users\jeffy\Documents\Python\247CTF>
Thank you for your help!
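One detail worth flagging in the first listing (it does not explain the second listing, which already encodes the answer as text): bytes(get_nums()) turns the integer result into a buffer of that many zero bytes rather than its digits, so the server never receives a readable answer there. A minimal sketch of the intended write, reusing the question's names:

# Sketch of the write step only, not a full solution to the question.
ans = get_nums()                            # returns an int, e.g. 155
tn.write(str(ans).encode('ascii') + b'\n')  # sends b'155\n' instead of 155 NUL bytes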

Select a page with a keyword and scrape it

I'm trying to scrape the title of an item on a foreign version of a site.
After I run the Python script, the CLI launches but returns nothing at all.
In IPython, title = soup.find('a', {'class': 'vip'}).text works great on its own to get the title, but it doesn't work in PyCharm within the full code, even though I installed the BeautifulSoup package for my current interpreter in the settings.
Any idea why? Thanks.
#!/usr/bin/python3
import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_detail_data(soup):
    # title
    try:
        title = soup.find('a', {'class': 'vip'}).text
    except:
        title = ''
    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''
    # currency
    try:
        currency = soup.find_all('span', {'class': 'bold'}).text
    except:
        currency = ''
    # items sold
    try:
        i_s = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = i_s.strip().split(' ')[0]
    except:
        items_sold = ''
    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)

def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)

if __name__ == '__main__':
    main()
It seems that the .fr site uses different markup, so you need to change the class names/attributes accordingly.
For example:
import re
import csv
import time
import requests
from bs4 import BeautifulSoup

product_category = input("Enter your product category: ")

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup

def get_detail_data(soup):
    # title
    try:
        title = soup.select_one('h1[itemprop="name"]')
        for span in title.select('span'):
            span.extract()
        title = title.get_text(strip=True)
    except:
        title = ''
    # price
    try:
        price = soup.find_all('span', {'itemprop': 'price'})
        for p in price:
            price = p.get('content')
    except:
        price = ''
    # currency
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except:
        currency = ''
    # items sold
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except:
        items_sold = ''
    data = {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold
    }
    return data

def get_index_data(soup):
    links = soup.select('.sresult h3 a')
    urls = [item.get('href') for item in links]
    return urls

def write_csv(data, url):
    with open('output.csv', 'a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['title'], data['price'], data['currency'], data['total sold'], url]
        writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(row)

def main():
    # Store URL formats for each search engine with placeholders
    url = f"https://www.ebay.fr/sch/i.html?_nkw={product_category}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))
    for link in products:
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # write_csv(data, link)  # <-- I commented it, to just print to screen

if __name__ == '__main__':
    main()
Prints:
Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}
...and so on.
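A side note on write_csv in both listings above: output.csv is opened in append mode and the header row is written on every call, so the file ends up with a header line before every data row. A small sketch that writes the header only when the file is new or empty:

import csv
import os

def write_csv(data, url, filename='output.csv'):
    # Write the header only once, when the file does not exist yet or is empty.
    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow([data['title'], data['price'], data['currency'], data['total sold'], url])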

Iteration failure when using BeautifulSoup

I'm using BeautifulSoup to try to extract data from a web page, but for some reason it fails to iterate over items found in seasons greater than 1. There is seemingly no reason for this behavior, as the nodes look exactly the same to me.
def scrape_show(show):
    source = requests.get(show.url).text
    soup = BeautifulSoup(source, 'lxml')
    # All seasons and episodes
    area = soup.find('div', class_='play_video-area-aside play_video-area-aside--related-videos play_video-area-aside--related-videos--titlepage')
    for article in area:
        if "season" in article.get('id'):
            season = article.h2.a.find('span', class_='play_accordion__section-title-inner').text
            print(season + " -- " + article.get('id'))
            # All content for the given season
            ul = article.find('ul')
            if ul is None:
                print("null!")  # This should not happen
Example Output:
Season 1 -- section-season1-xxxx
Season 2 -- section-season2-xxxx
null!
https://www.svtplay.se/andra-aket (url from example)
The data is not available in HTML form for all seasons, only for season 1, but the information is embedded in the page in JSON form. You can parse this data with the re and json modules:
import re
import json
import requests

url = 'https://www.svtplay.se/andra-aket?tab=season-1-18927182'
data = json.loads(re.findall(r"root\['__svtplay_apollo'\] = (\{.*?\});", requests.get(url).text)[0])

from pprint import pprint
# pprint(data)  # <-- uncomment this to see all the data

for k in data:
    if k.startswith('Episode:') or (k.startswith('$Episode:') and k.endswith('urls')):
        print(k)
        pprint(data[k])
        print('-' * 80)
Prints (data about episodes 1 and 2 and their URLs):
Episode:1383301-001
{'__typename': 'Episode',
'accessibilities': {'json': ['AudioDescribed', 'SignInterpreted'],
'type': 'json'},
'duration': 1700,
'id': '1383301-001',
'image': {'generated': False,
'id': 'Image:18926434',
'type': 'id',
'typename': 'Image'},
'live': None,
'longDescription': 'Madde och Petter flyttar tillsammans med sin 13-åriga '
'dotter Ida till Björkfjället, en liten skidort i svenska '
'fjällen. Madde är uppvuxen där men för '
'Stockholms-hipstern Petter är det ett chockartat '
'miljöombyte. Maddes mamma Ingegerd har gått i pension och '
'lämnat över ansvaret för familjens lilla hotell till '
'Madde. Hon och Petter ska nu driva "Gammelgården" med '
'Maddes bror Tommy, vilket visar sig vara en inte helt '
'lätt uppgift. I rollerna: Sanna Sundqvist, Jakob '
'Setterberg, William Spetz, Bert-Åke Varg, Mattias '
'Fransson och Lena T Hansson. Del 1 av 8.',
'name': 'Avsnitt 1',
'nameRaw': '',
'positionInSeason': 'Säsong 1 — Avsnitt 1',
'restrictions': {'generated': True,
'id': '$Episode:1383301-001.restrictions',
'type': 'id',
'typename': 'Restrictions'},
'slug': 'avsnitt-1',
'svtId': 'jBD1gw8',
'urls': {'generated': True,
'id': '$Episode:1383301-001.urls',
'type': 'id',
'typename': 'Urls'},
'validFrom': '2019-07-25T02:00:00+02:00',
'validFromFormatted': 'Tor 25 jul 02:00',
'validTo': '2020-01-21T23:59:00+01:00',
'variants': [{'generated': False,
'id': 'Variant:1383301-001A',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001S',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001T',
'type': 'id',
'typename': 'Variant'}],
'videoSvtId': '8PbQdAj'}
--------------------------------------------------------------------------------
$Episode:1383301-001.urls
{'__typename': 'Urls',
'svtplay': '/video/19970142/andra-aket/andra-aket-sasong-1-avsnitt-1'}
--------------------------------------------------------------------------------
... and so on.
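Building on the structure shown in that output (and assuming the Apollo cache keeps this shape for every show), each Episode entry can be joined with its Urls record by following the reference stored under its 'urls' key:

# Rough sketch, assuming `data` is the dict parsed in the answer above.
base = 'https://www.svtplay.se'
for key, value in data.items():
    if key.startswith('Episode:'):
        urls_ref = value['urls']['id']          # e.g. '$Episode:1383301-001.urls'
        play_path = data[urls_ref]['svtplay']   # e.g. '/video/19970142/...'
        print(value['name'], '->', base + play_path)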

Python XML-RPC Export Import Program

I am trying to pass a many2one field to another database. Right now my program uses XML-RPC to get into a database, fetch data from the product.template table, and then create a CSV. The field that I want to pass returns
"[54, 'PARTS / COST']" along with the other fields. I only need the 54, which is the id. Any idea where to intercept this issue or how to solve it? These are the two methods I have so far.
def FetchProducts(self):
    products = self.odoo_object.execute_kw(
        self.db,
        self.uid,
        self.password,
        'product.template',
        'search_read',
        [[['sale_ok', '=', True], ['purchase_ok', '=', True]]],
        {'fields': ['id', 'name', 'sale_ok', 'purchase_ok', 'type', 'default_code', 'barcode', 'list_price', 'standard_price', 'categ_id'], 'limit': 1}
    )
    return products

def ProductsCSV(self, products):
    csv_columns = ['id', 'name', 'sale_ok', 'purchase_ok', 'type', 'default_code', 'barcode', 'list_price', 'standard_price', 'categ_id']
    csv_file = "Products.csv"
    try:
        with open(csv_file, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, csv_columns, delimiter=',')
            writer.writeheader()
            for data in products:
                writer.writerow(data)
                print("Writing Products " + str(data))
    except IOError:
        print("I/O error")
I think the problem is in this line:
with open(csv_file, 'w', encoding='utf-8', newline='') as csvfile:
In my opinion, remove the encoding and newline arguments.
I ran your code with the project module and it worked perfectly.
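The suggestion above does not touch the many2one formatting itself. Since search_read returns a many2one field such as categ_id as an [id, display_name] pair (or False when empty), one way to keep only the numeric id is to post-process each record before writing it, for example inside ProductsCSV (a sketch reusing the question's names):

import csv

def ProductsCSV(self, products):
    csv_columns = ['id', 'name', 'sale_ok', 'purchase_ok', 'type', 'default_code',
                   'barcode', 'list_price', 'standard_price', 'categ_id']
    with open("Products.csv", 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, csv_columns, delimiter=',')
        writer.writeheader()
        for data in products:
            # categ_id arrives as [id, display_name]; keep only the id.
            if isinstance(data.get('categ_id'), (list, tuple)):
                data['categ_id'] = data['categ_id'][0]
            writer.writerow(data)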

Compare two strings by meaning

Are there any solutions for comparing short strings not by characters, but by meaning? I've tried to google it, but all the search results are about comparing characters, length and so on.
I'm not asking for ready-to-use solutions; just show me the direction I need to dig in.
Thank you in advance.
Your question is not clear enough. When you compare strings by meaning, you need to define what counts as equal. For example, "I have 10 dollars" and "there are 10 dollars in my pocket": are they equal by your definition? Sometimes there is implied meaning in a string.
An answer to a very similar (closed) question, which wants to compare the context between two lists ['apple', 'spinach', 'clove'] and ['fruit', 'vegetable', 'spice'], using the Google Knowledge Graph Search API:
import json
from urllib.parse import urlencode
from urllib.request import urlopen

def get_descriptions_set(query: str) -> set[str]:
    descriptions = set()
    kg_response = get_kg_response(query)
    for element in kg_response['itemListElement']:
        if 'description' in element['result']:
            descriptions.add(element['result']['description'].lower())
    return descriptions

def get_kg_response(query: str) -> dict:
    api_key = open('.api_key').read()
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    url = f'{service_url}?{urlencode(params)}'
    response = json.loads(urlopen(url).read())
    return response

def main() -> None:
    list_1 = ['apple', 'spinach', 'clove']
    list_2 = ['fruit', 'vegetable', 'spice']
    list_1_kg_descriptions = [get_descriptions_set(q) for q in list_1]
    print('\n'.join(f'{q} {descriptions}'
                    for q, descriptions in zip(list_1, list_1_kg_descriptions)))
    list_2_matches_context = [
        d in descriptions
        for d, descriptions in zip(list_2, list_1_kg_descriptions)
    ]
    print(list_2_matches_context)

if __name__ == '__main__':
    main()
Output:
apple {'watch', 'technology company', 'fruit', 'american singer-songwriter', 'digital media player', 'mobile phone', 'tablet computer', 'restaurant company', 'plant'}
spinach {'video game', 'plant', 'vegetable', 'dish'}
clove {'village in england', 'spice', 'manga series', 'production company', '2018 film', 'american singer-songwriter', '2008 film', 'plant'}
[True, True, True]
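Another direction to dig in, separate from the Knowledge Graph approach above: sentence embeddings. A library such as sentence-transformers (assumed installed; the model name below is only an example) maps each string to a vector so that strings close in meaning get a high cosine similarity:

# Sketch only: semantic similarity via sentence embeddings.
# Assumes `pip install sentence-transformers`; the model name is an example choice.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['I have 10 dollars', 'There are 10 dollars in my pocket']
embeddings = model.encode(sentences, convert_to_tensor=True)

# A score close to 1.0 means the two strings are close in meaning.
score = util.cos_sim(embeddings[0], embeddings[1]).item()
print(score)

Depending on the library version, the helper may be util.cos_sim or util.pytorch_cos_sim.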
