I have an EJS file that I render from Node.js. It seems to be doing the right thing, but the links are not rendered properly.
here is the call from node.js:
res.render('response.ejs', {jsondata : isu});
here is the code on the response.ejs page
<div>
<%# Emit one list item per entry of the jsondata array passed to res.render(). %>
<% for(var i=0; i < jsondata.length; i++) { %>
<li>
<%# NOTE(review): the "=" output tag HTML-escapes its value, so the anchor markup built by link_to() is shown as literal text; the unescaped "-" output tag is presumably what is wanted here - confirm. %>
<%= link_to(jsondata[i].time, "Entry/"+jsondata[i]._id) %>
<%= jsondata[i].location.place[0]%>
</li>
<% } %>
</div>
this is what it returns (displayed on the page):
<a href='Entry/5411f73ef0fd92861601775f' >Thu Sep 11 2014 12:25:50 GMT-0700 (PDT)</a> Woodlawn Ave N
<a href='Entry/541251980570a2ba17f02c89' >Thu Sep 11 2014 18:51:43 GMT-0700 (PDT)</a> Woodlawn Ave N
<a href='Entry/5412524d0570a2ba17f02c8b' >Thu Sep 11 2014 18:54:29 GMT-0700 (PDT)</a> Woodlawn Ave N
<a href='Entry/541253b50570a2ba17f02c8d' >Thu Sep 11 2014 19:00:32 GMT-0700 (PDT)</a> Bagley Ave N
<a href='Entry/541254c054041703194a957d' >Thu Sep 11 2014 19:04:48 GMT-0700 (PDT)</a> Russell Ave NW
<a href='Entry/541502b197ed2f8022f0b399' >Sat Sep 13 2014 19:51:29 GMT-0700 (PDT)</a> Woodlawn Ave N
But if I copy this output to a file and open it in the browser, the hrefs are rendered as links, so I have no idea what is going on here.
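<%= %> escapes HTML, so the <a> markup returned by link_to is displayed as text instead of being rendered as a link. Use the unescaped output tag <%- %> instead.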
Related
I'm trying to search for 3 (or more) specific RegEx inside HTML documents.
The HTML files do all have different forms and layouts but specific words, so I can search for the words.
Now, I'd like to return the line:
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>
I've tried plenty of versions of the code but I'm stumbling in the dark currently.
import re
from bs4 import Tag, BeautifulSoup
text = """
<body>
<div>
<div>
<p>This 19 is A BIG test</p>
<p>This is another test</p>
<p>19 that is yet <em>another</em> great <strong>test</strong> with a CAR</p>
</div>
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>
</div>
</body>
"""
def searchme(bstag):
    """Return the innermost tag inside *bstag* whose text matches every pattern.

    The three patterns (``17738``, ``CAR`` and ``greaterly``) may be spread
    over different descendants (for example sibling ``<p>`` elements), so
    each candidate is tested against its combined text instead of against a
    single string node.

    Args:
        bstag: The ``BeautifulSoup`` object or ``Tag`` to search in.

    Returns:
        *bstag* itself or the deepest tag below it (following the first
        matching child at every level, in document order) whose text matches
        all three patterns; ``None`` when *bstag* does not match them all.
    """
    print("searchme")
    patterns = [
        re.compile(pattern, re.MULTILINE)
        for pattern in (r"17738", r"CAR", r"greaterly")
    ]

    def matches_all(node):
        # get_text() joins the text of all descendants, so patterns that
        # live in different child tags are still seen together.
        node_text = node.get_text()
        return all(pattern.search(node_text) for pattern in patterns)

    # Nothing to narrow down if even the starting point does not match.
    if not matches_all(bstag):
        return None

    # Walk downwards one level at a time. Climbing towards the parent (as the
    # earlier version did) can never terminate on a miss, because the root
    # has no parent to stop at.
    current = bstag
    while True:
        narrower = next(
            (
                child
                for child in current.find_all(True, recursive=False)
                if matches_all(child)
            ),
            None,
        )
        if narrower is None:
            # No direct child contains all patterns: current is the smallest
            # enclosing tag.
            return current
        current = narrower
# Parse the sample markup with the built-in HTML parser, then show whatever
# searchme() returns for the whole document.
soup = BeautifulSoup(text, features='html.parser')
el = searchme(soup)
print(el)
EDIT 1
Updated the desired returned code
I am not sure I have understood the example, given that no single line (`<p>` element) in the text object contains all three regex terms.
If, however, I have understood the question correctly, I would recommend not using regex for this task (it is sub-optimal in terms of computation time and effort) and relying on the much simpler `in` operator instead. Below is a MWE in which I have slightly modified the text from your original example so that it contains the line you are interested in.
from bs4 import Tag, BeautifulSoup
text = """
<body>
<div>
<div>
<p>This 19 is A BIG test</p>
<p>This is another test</p>
<p>19 that is yet <em>another</em> great <strong>test</strong> with a CAR</p>
</div>
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> greaterly <strong>test</strong> with a CAR</p>
</div>
</div>
</body>
"""
# Plain substrings to look for; all of them must occur in the same <div>.
t1 = '17738'
t2 = 'CAR'
t3 = 'greaterly'
soup = BeautifulSoup(text, 'html.parser')
# find_all() is the current method name; findAll() is a deprecated alias.
# NOTE: .text includes the text of nested tags, so a <div> that wraps a
# matching <div> matches as well.
for row in soup.find_all('div'):  # visit every <div> element in document order
    row_text = row.text  # extract the text once instead of once per term
    if all(term in row_text for term in (t1, t2, t3)):
        print(row_text)
You can use CSS selector div:has(> p), which will search <div> tags that have <p> tags directly under them.
For example:
from bs4 import BeautifulSoup
text = """
<body>
<div>
<div>
<p>This 19 is A BIG test</p>
<p>This is another test</p>
<p>19 that is yet <em>another</em> great <strong>test</strong> with a CAR</p>
</div>
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>
</div>
</body>"""
to_search = ['17738', 'CAR', 'greaterly']
soup = BeautifulSoup(text, 'html.parser')
# 'div:has(> p)' selects only <div> tags that have a <p> tag DIRECTLY under
# them; keep those whose text contains every search word.
results = [
    div
    for div in soup.select('div:has(> p)')
    if all(word in div.text for word in to_search)
]
print(results)
Prints:
[<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>]
Another method.
from simplified_scrapy import SimplifiedDoc
html = """
<body>
<div>
<div>
<p>This 19 is A BIG test</p>
<p>This is another test</p>
<p>19 that is yet <em>another</em> great <strong>test</strong> with a CAR</p>
</div>
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>
</div>
</body>
"""
regex1 = r"17738"
regex2 = r"CAR"
regex3 = r"greaterly"
doc = SimplifiedDoc(html)
# Start from every <p> matching regex3 and inspect the <p> that follows it.
for para in doc.getElementsByReg(regex3, tag='p'):
    following = para.getNext('p')
    if not following.contains([regex1, regex2], attr='html'):
        continue
    # Print the enclosing element of the matching <p> (the div) and stop at
    # the first hit.
    print(following.parent.outerHtml)
    break
Result:
<div>
<p>This 17 is A BIG test</p>
<p>This is another greaterly test</p>
<p>17738 that is yet <em>another</em> <strong>test</strong> with a CAR</p>
</div>
Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
I've got this HTML
<div>
<div tabindex="0" class="dropdown xs-dropdown show" id="selBudgetYearRange-wrap" style="border: 1px solid rgb(206, 212, 218); border-image: none;">
<button tabindex="-1" class="btn btn-default dropdown-toggle" id="selBudgetYearRange-btn" aria-expanded="true" aria-haspopup="true" type="button" data-toggle="dropdown">2020 vs 2019 (Oct 2019 - Mar 2020 vs Oct 2018 - Mar 2019)
</button>
<div class="dropdown-menu show" id="selBudgetYearRange-dmenu" aria-labelledby="selBudgetYearRange-btn" style="left: 0px; top: 0px; width: calc(100% + 2 * 1px); margin-top: calc(0px + 1px); margin-left: calc(-0px - 1px); position: absolute; transform: translate3d(0px, 21px, 0px);" x-placement="bottom-start">
<a class="dropdown-item active" onclick="dropdownChanged(this, 'selBudgetYearRange','201910-202003,2');;return false;" href="javascript:void(0)">2020 vs 2019 (Oct 2019 - Mar 2020 vs Oct 2018 - Mar 2019)</a>
<a class="dropdown-item" onclick="dropdownChanged(this, 'selBudgetYearRange','201810-201909,2');;return false;" href="javascript:void(0)">2019 vs 2018 (Whole Year Oct - Sep)</a>
<a class="dropdown-item" onclick="dropdownChanged(this, 'selBudgetYearRange','201710-201809,2');;return false;" href="javascript:void(0)">2018 vs 2017 (Whole Year Oct - Sep)</a>
</div><input name="selBudgetYearRange" id="selBudgetYearRange" type="hidden" value="201910-202003,2"></div>
</div>
which has 3 options
2020 vs 2019 (Oct 2019 - Mar 2020 vs Oct 2018 - Mar 2019)
2019 vs 2018 (Whole Year Oct - Sep)
2018 vs 2017 (Whole Year Oct - Sep)
I've managed to work out that by using
.FindElementByXPath("//*[text()='2019 vs 2018 (Whole Year Oct - Sep)']").Click
I can select the middle option, however it only works if I click on the drop down first so that the options are visible.
I can't seem to get selenium to click on the drop down for me and make the options visible so that I can use the above xpath to select my desired option.
Please can someone advise on how to do this? Or if there's a way of selecting my options without needing to get the drop down to appear first?
So I managed to get it to work
by using ClickDouble on the drop-down (after figuring out its XPath).
I have no idea why I need to use clickdouble instead of click.
But it works, so I'm not going to complain too much
I want to scrape a web page (German complaint website) using BeautifulSoup. Here is a good example (https://de.reclabox.com/beschwerde/44870-deutsche-bahn-berlin-erstattungsbetrag-sparpreisticket)
<div id="comments" class="kt">
<a name="comments"></a>
<span class="bb">Kommentare und Trackbacks (7)</span>
<br><br><br>
<a id="comment100264" name="comment100264"></a>
<div class="data">
19.12.2011 | 11:04
</div>
von Tom K.
<!--
-->
| <a class="flinko" href="/users/login?functionality_required=1">Regelverstoß melden</a>
<div class="linea"></div>
TEXT I AM INTEREST IN<br><br>MORE TEXT I AM INTEREST IN<br><br>MORETEXT I AM INTEREST IN
<br><br>
<a id="comment100265" name="comment100265"></a>
<div class="data">
19.12.2011 | 11:11
</div>
von Tom K.
<!--
-->
| <a class="flinko" href="/users/login?functionality_required=1">Regelverstoß melden</a>
<div class="linea"></div>
TEXT I AM INTEREST IN<br><br>MORE TEXT I AM INTEREST IN
<br><br>
<a id="comment101223" name="comment101223"></a>
<div class="commentbox comment-not-yet-solved">
<div class="data">
25.12.2011 | 10:14
</div>
von ReclaBoxler-4134668
<!--
--><img alt="noch nicht gelöste Beschwerde" src="https://a1.reclabox.com/assets/live_tracking/not_yet_solve-dbf4769c625b73b23618047471c72fa45bacfeb1cf9058655c4d75aecd6e0277.png" title="noch nicht gelöste Beschwerde">
| <a class="flinko" href="/users/login?functionality_required=1">Regelverstoß melden</a>
<div class="linea"></div>
TEXT I AM NOT INTERESTED IN <br><br>TEXT I AM NOT INTERESTED IN
</div>
<br><br>
<a id="comment101237" name="comment101237"></a>
<div class="data">
25.12.2011 | 11:01
</div>
von ReclaBoxler-3315297
<!--
-->
| <a class="flinko" href="/users/login?functionality_required=1">Regelverstoß melden</a>
<div class="linea"></div>
TEXT I AM INTERESTED IN
<br><br>
etc...
<br><br>
<br><br>
</div>
I was able to scrape most of the content I want (thanks to a lot of Q&A's I read here:-)) except for the comments (<div id="comments" class="kt">) which are not in a class ="commentbox" (I got the commentboxes already with another command). The comments outside the comment boxes seem to be not in a normal tag, that's why I just did not manage to get them via "soup.find(_all)". I'd like to scrape these comments as well as information about the person posting the comment ("von") as well as the date and time (<div class="data">).
It would be absolutely fantastic if someone knows how to solve this one. Thanks in advance for your help!
A common way to extract all the text from a page is as follows:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# Placeholder for the page markup. BeautifulSoup parses HTML text and does not
# fetch URLs, so download the page first (e.g. urlopen(Request(url)).read())
# and pass the result in here.
doc = """xxxxxxxx"""
soup = BeautifulSoup(doc, "html.parser")
# get_text() returns the text of the whole document with all tags removed.
print(soup.get_text())
I have a problem getting the data from a page. This is part of my code:
# Collect and echo the text of the <p class="size16"> inside every result.
for result in results:
    address_tag = result.find('p', class_='size16')
    street = address_tag.text
    records.append(street)
    print(street)
Website:
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">City</span>
<p class="small mb20"> </p>
<p class="size16">street 98<br>phone. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>open</strong></div>
<div class="mb20 size16">Mon.-Fr. 07.30-15.30</div>
<div class="mb15 ">
Result of my code:
ul. Bema 2phone. (32) 745 72 66-69 Wroclaw None
ul. 1 Maja 22/Vphone. 537-943-969 Olawa <p class="small mb20 colorgreen">Placowka partnerska</p>
I would like to separate or delete the text after the "br" tag. I only need the 'street' part:
<p class="size16">street 98<br>phone. 22 721-56-70</p>
Can You help me?
Use previous_sibling like this:
from bs4 import BeautifulSoup
html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
result = BeautifulSoup(html, features="lxml")
# The node immediately before the first <br> in the document is the street.
print(result.find('br').previous_sibling)
Or if you want to narrow it down a bit:
# Restrict the lookup to the <p class="size16"> element before taking the
# node that precedes its <br>.
size16_paragraph = result.find('p', class_='size16')
street = size16_paragraph.find('br').previous_sibling
print(street)
Outputs (in both cases)
Poznańska 98
From the documentation https://www.crummy.com/software/BeautifulSoup/bs4/doc/
.next_sibling and .previous_sibling
You can use .next_sibling and .previous_sibling to navigate between page elements that are on the same level of the parse tree:
from bs4 import BeautifulSoup
html = """
<div class="media-body pt5 pb10">
<div class="mb15">
<span class="map-item-city block mb0 colorgreen">Bronisze</span>
<p class="small mb20"> </p>
<p class="size16">Poznańska 98<br>tel. 22 721-56-70</p>
</div>
<div class="colorblack"><strong>Godziny otwarcia</strong></div>
<div class="mb20 size16">Pn.-Pt. 07.30-15.30</div>
<div class="mb15 ">
"""
soup = BeautifulSoup(html, "lxml")
# Drill down div.media-body -> div.mb15 -> p.size16 and lazily yield every
# node that precedes the <br> inside each such paragraph.
nodes_before_br = (
    node
    for outer_div in soup.find_all('div', class_="media-body pt5 pb10")
    for inner_div in outer_div.find_all('div', class_="mb15")
    for paragraph in inner_div.find_all("p", class_="size16")
    for node in paragraph.find("br").previous_siblings
)
for node in nodes_before_br:
    print(node.get_text())
I am trying to select a radio button and choose a value in the drop-down (class SelectCls), but I am facing a problem with my code. Please help me with this.
Thanks in advance..
My VBA Code is:
' Choose the cycle in the element with id "ddlCycleID".
' NOTE(review): in the page source "SelectCls" is the CSS class of that <select>, not a
' property of the element - presumably .Value = "274" (the option's value) is intended; confirm.
Browser.document.getElementById("ddlCycleID").SelectCls = "Cycle 274 (21 Jun 2017 - 31 Jul 2017)"
' Tick the "Processing Module" radio button (id "chkproc").
Browser.document.getElementById("chkproc").Checked = True
' Click the element with id "btnSubmit".
Browser.document.getElementById("btnSubmit").Click
And HTML Source code is:
<td>
<input value="chkproc" name="Module" type="radio" id="chkproc" checked="checked" tabindex="1" onclick="toggle(this)"> Processing Module
<td>
<td>
<input value="chkquery" name="Module" type="radio" id="chkquery" tabindex="2" onclick="toggle(this)"> Other Modules
<td>
<td>
<select name="ddlCycleID" id="ddlCycleID" tabindex="4" class="SelectCls" style="width:180px;">
<option value="Select Cycle">Select Cycle</option>
<option value="274">Cycle 274 (21 Jun 2017 - 31 Jul 2017)</option>
<option value="273">Cycle 273 (29 Jun 2017 - 29 Jun 2017)</option>
</select>
<td>
getting below error msg
Run-time error '424'
object required
in this code:
Browser.document.getElementById("ddlCycleID").SelectCls = "Cycle 274 (21 Jun 2017 - 31 Jul 2017)"
showing this "="" when i place cursor on this code