Python: Extract text and element selectors from HTML elements

Given something like the following HTML:
<div>
  <div>
    <meta ... />
    <img />
  </div>
  <div id="main">
    <p class="foo">Hello, World</p>
    <div>
      <div class="bar">Hey, there!</div>
    </div>
  </div>
</div>
How would I go about selecting only the elements that have text and outputting a generated, unique CSS selector for each such element?
For this example, that would be:
[
  { "html": "Hello, World", "selector": ".foo" },  # can be even more specific if there are other .foo's
  { "html": "Hey, there!", "selector": ".bar" }
]
I was playing with BeautifulSoup and html_sanitizer but wasn't getting great results.

This should be a piece of cake with BeautifulSoup:
from bs4 import BeautifulSoup

html = """
<div>
  <div>
    <meta ... />
    <img />
  </div>
  <div id="main">
    <p class="foo">Hello, World</p>
    <div>
      <div class="bar">Hey, there!</div>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

results = []
for element in soup.find_all(string=True):
    if element.strip() == '':
        continue
    # walk up to the nearest ancestor that has an id or class to anchor the selector
    parent = element.parent
    while parent and not (parent.has_attr('id') or parent.has_attr('class')):
        parent = parent.parent
    if parent:
        if parent.has_attr('id'):
            selector = '#' + parent['id']
        else:
            # join all classes into one compound selector string, e.g. '.foo.bar'
            selector = ''.join('.' + cls for cls in parent['class'])
        results.append({
            "html": element.strip(),
            "selector": selector
        })
print(results)
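With the sample HTML from the question, this should print something like:
[{'html': 'Hello, World', 'selector': '.foo'}, {'html': 'Hey, there!', 'selector': '.bar'}]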

Came up with this by directing Copilot and ChatGPT:
from bs4 import BeautifulSoup

def get_css_selector(element):
    selector = element.name
    if element.has_attr("id"):
        selector += "#" + element["id"]
    else:
        classes = element.get("class", [])
        if classes:
            selector += "." + ".".join(classes)
        else:
            # no id or class: qualify via the parent, stopping before the soup root
            parent = element.find_parent()
            if parent and parent.name != "[document]":
                parent_selector = get_css_selector(parent)
                selector = parent_selector + " > " + selector
            index = 1
            for sibling in element.previous_siblings:
                if sibling.name == element.name:
                    index += 1
            selector += f":nth-of-type({index})"
    return selector

def get_html_segments(page):
    soup = BeautifulSoup(page, "html.parser")
    html_segments = []
    for tag in soup.find_all():
        if tag.name not in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "a", "span", "div"]:
            continue
        # only keep tags whose sole child is a text node
        if len(tag.contents) == 1 and tag.contents[0].name is None:
            html_segments.append(
                {"text": str(tag.contents[0]), "css_selector": get_css_selector(tag)}
            )
    return html_segments
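Calling get_html_segments on the question's sample HTML returns something like:
[{'text': 'Hello, World', 'css_selector': 'p.foo'}, {'text': 'Hey, there!', 'css_selector': 'div.bar'}]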
Please let me know if someone comes up with something more effective.
At some point, it'd be cool to take the text from block elements and have inline text elements converted to their innerText, as in the sketch below.
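For that last part, BeautifulSoup's get_text() already flattens inline children into their text, so a minimal sketch (assuming you only want leaf block elements, with their inline markup collapsed) could be:
from bs4 import BeautifulSoup

BLOCK_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]

def get_block_texts(page):
    soup = BeautifulSoup(page, "html.parser")
    texts = []
    for tag in soup.find_all(BLOCK_TAGS):
        # skip blocks that contain other block-level tags, so nested
        # containers don't duplicate their children's text
        if tag.find(BLOCK_TAGS):
            continue
        # get_text() concatenates the text of inline children (a, span, b, ...)
        text = tag.get_text(" ", strip=True)
        if text:
            texts.append(text)
    return texts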

Related

How to parse the drop down list and get the all the links for the pdf using Beautiful Soup in Python?

I'm trying to scrape the PDF links from the drop down on this website. I want to scrape just the Guideline Values (CVC) drop down. Following is the code that I used, but it did not succeed:
import requests
from bs4 import BeautifulSoup

req_ses = requests.Session()
igr_get_base_response = req_ses.get("https://igr.karnataka.gov.in/english#")
soup = BeautifulSoup(igr_get_base_response.text)

def matches_block(tag):
    return matches_dropdown(tag) and tag.find(matches_text) != None

def matches_dropdown(tag):
    return tag.name == 'li' and tag.has_attr('class') and 'dropdown-toggle' in tag['class']

def matches_text(tag):
    return tag.name == 'a' and tag.get_text()

for li in soup.find_all(matches_block):
    for ul in li.find_all('ul', class_='dropdown-toggle'):
        for a in ul.find_all('a'):
            if a.has_attr('href'):
                print(a['href'])
Any suggestion would be a great help!
Edit: Adding part of the HTML below:
<div class="collapse navbar-collapse">
  <ul class="nav navbar-nav">
    <li class="">
      <i class="fa fa-home"> </i>
    </li>
    <li>
      <a class="dropdown-toggle" data-toggle="dropdown" title="RTI Act">RTI Act <b class="caret"></b></a>
      <ul class="dropdown-menu multi-level">
        <!-- <li> -->
        <li class="">
          <a href=" https://igr.karnataka.gov.in/page/RTI+Act/Yadagiri+./en " title="Yadagiri .">Yadagiri .
          </a>
        </li>
        <!-- </li> -->
        <!-- <li>
I have tried to get the links to all the PDF files that you need.
I selected the <a> tags whose href matches the pattern (see patt in the code). This pattern is common to all the PDF files that you need.
After this runs, you have all the links to the PDF files in the links list.
from bs4 import BeautifulSoup
import requests

url = 'https://igr.karnataka.gov.in/english#'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')

a = soup.find('a', attrs={'title': 'Guidelines Value (CVC)'})
lst = a.parent()  # calling a tag is shorthand for find_all(), so this collects the parent's descendants
links = []
patt = 'https://igr.karnataka.gov.in/storage/pdf-files/Guidelines Value/'
for i in lst:
    temp = i.find('a')
    if temp:
        if patt in temp['href']:
            links.append(temp['href'].strip())
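A quick sanity check (assuming the page structure hasn't changed since) is just to print what was collected:
for link in links:
    print(link)
# entries look like:
# https://igr.karnataka.gov.in/storage/pdf-files/Guidelines Value/chikkaballapur sro.pdf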
I first find the ul_tag in which all the data is available, then call find_all on it for the <a> tags with attrs having target="_blank"; those hold the .pdf hrefs, so from them we can extract only the .pdf links.
from bs4 import BeautifulSoup
import requests

res = requests.get("https://igr.karnataka.gov.in/english#")
soup = BeautifulSoup(res.text, "lxml")
ul_tag = soup.find("ul", class_="nav navbar-nav")
a_tag = ul_tag.find_all("a", attrs={"target": "_blank"})
for i in a_tag:
    print(i.get_text(strip=True))
    print(i.get("href").strip())
Output:
SRO Chikkaballapur
https://igr.karnataka.gov.in/storage/pdf-files/Guidelines Value/chikkaballapur sro.pdf
SRO Gudibande
https://igr.karnataka.gov.in/storage/pdf-files/Guidelines Value/gudibande sro.pdf
SRO Shidlaghatta
https://igr.karnataka.gov.in/storage/pdf-files/Guidelines Value/shidlagatta sro.pdf
SRO Bagepalli
....
So, I used the following approach to complete the above-mentioned part:
# get_hash and IGR_SQLITE_HSH_TUP are defined elsewhere in my codebase
def make_sqlite_dict_from_parsed_row(district_value, sro_value, pdf_file_link):
    sqlite_dict = {
        "district_value": district_value,
        "sro_value": sro_value,
        "pdf_file_link": pdf_file_link.strip().replace(' ', '%20'),
        "status": "PENDING"
    }
    sqlite_dict['hsh'] = get_hash(sqlite_dict, IGR_SQLITE_HSH_TUP)
    return sqlite_dict

li_element_list = home_response_soup.find_all('li', {'class': 'dropdown-submenu'})
parsed_row_list = []
for ele in li_element_list:
    district_value = ele.find('a', {'class': 'dropdown-toggle'}).get_text().strip()
    sro_pdf_a_tags = ele.find_all('a', attrs={'target': '_blank'})
    if len(sro_pdf_a_tags) >= 1:
        for sro_a_tag in sro_pdf_a_tags:
            sqlite_dict = make_sqlite_dict_from_parsed_row(
                district_value,
                sro_a_tag.get_text(strip=True),
                sro_a_tag.get('href')
            )
            parsed_row_list.append(sqlite_dict)
    else:
        print("District: ", district_value, "'s pdf is corrupted")
This will give a proper pdf_file_link, sro_value and district_value for each row.
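The snippet relies on a get_hash helper and an IGR_SQLITE_HSH_TUP key tuple that aren't shown. A hypothetical stand-in, just so the code runs, might look like:
import hashlib

# Hypothetical: the real definitions live elsewhere in the author's codebase.
IGR_SQLITE_HSH_TUP = ("district_value", "sro_value", "pdf_file_link")

def get_hash(row_dict, key_tup):
    # hash the selected fields so each row gets a stable dedup key
    joined = "|".join(str(row_dict[k]) for k in key_tup)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()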

How to select second div tag with same classname?

I'm trying to select the second div tag with the info classname, but with no success using bs4 find_next. How do you go about selecting the text inside the second div tag that shares the classname?
[<div class="info">
   <a href="/clubs/12/Manchester-United/overview">
     Manchester United<span class="playerClub badge-20 t1"></span>
   </a>
 </div>,
 <div class="info">Defender</div>,
 <div class="info">
   <a href="/clubs/12/Manchester-United/overview">
     Manchester United<span class="playerClub badge-20 t1"></span>
   </a>
 </div>,
 <div class="info">Defender</div>]
Here is what I have tried:
from bs4 import BeautifulSoup
import requests

players_url = ['http://www.premierleague.com//players/13559/Axel-Tuanzebe/stats']

# this is the dict where we store all information:
players = {}
for url in players_url:
    player_page = requests.get(url)
    cont = BeautifulSoup(player_page.content, 'lxml')
    data = dict((k.contents[0].strip(), v.get_text(strip=True))
                for k, v in zip(cont.select('.topStat span.stat, .normalStat span.stat'),
                                cont.select('.topStat span.stat > span, .normalStat span.stat > span')))
    club = {"Club": cont.find('div', attrs={'class': 'info'}).get_text(strip=True)}
    position = {"Position": cont.find_next('div', attrs={'class': 'info'})}
    players[cont.select_one('.playerDetails .name').get_text(strip=True)] = data
    print(position)
You can try the following:
club_ele = cont.find('div', attrs={'class': 'info'})
club = {"Club": club_ele.get_text(strip=True)}
position = {"Position": club_ele.find_next('div', attrs={'class': 'info'}).get_text(strip=True)}
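Alternatively (assuming the markup shown in the question), you can grab all the div.info tags at once and index into them:
infos = cont.find_all('div', attrs={'class': 'info'})
club = {"Club": infos[0].get_text(strip=True)}      # first div.info
position = {"Position": infos[1].get_text(strip=True)}  # second div.info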

Beautiful Soup Conditional Query

I am new to Beautiful Soup.
I need to get data from an HTML file.
<div class="ques_ans_block">
  <div class="question">
    <p>is this correct ?</p>
  </div>
  <p class="answer"></p>
  <div class="moreinfo" style="display: block;">
    <p class="answer"></p>
    <p class="answer"></p>
  </div>
</div>
The condition is: the "moreinfo" div can be present or absent.
So I need to find the question and answer (including the answer from "moreinfo" if present) inner text for each ques_ans_block.
This will give output as JSON containing Question, Answer and FaqId.
import bs4
import json
import codecs

arrayList = []
bsp = bs4.BeautifulSoup(open('input.html'), 'html.parser')
ques_ans_block = bsp.find_all("div", {"class": "ques_ans_block"})
s = ""
count = 1
for i in ques_ans_block:
    data = {}
    q = i.select('.question')
    for a in q:
        s += a.text + "\n"
    # remove the question nodes so only answer text remains in the block
    for a in q:
        a.extract()
    data["Question"] = s
    v = ""
    for a in i.select('p'):
        v += a.text + "\n"
    for a in i.select('li'):
        v += a.text + "\n"
    data["Answer"] = v
    data["FaqId"] = count
    arrayList.append(data)
    count = count + 1
    s = ""

with codecs.open('output.json', 'wt', 'utf-8') as outfile:
    json.dump(arrayList, outfile, indent=4)
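Note that i.select('p') already picks up the p.answer tags inside "moreinfo" when it exists, which is what handles the present-or-absent condition. If you wanted that check to be explicit, a sketch (assuming the markup above) could be:
for block in bsp.find_all("div", {"class": "ques_ans_block"}):
    question = block.select_one(".question").get_text(strip=True)
    # p.answer inside .moreinfo is matched too, since select() searches all descendants
    answers = [p.get_text(strip=True) for p in block.select("p.answer")]
    moreinfo = block.select_one(".moreinfo")  # None when the div is absent
    if moreinfo is not None:
        # only needed if the optional answers should be treated separately
        extra = [p.get_text(strip=True) for p in moreinfo.select("p.answer")]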

Is it possible to skip tags that have already been parsed and continue with the newer ones in Python?

Let's suppose that I have the following HTML:
<div class="class1">
  <div class="some multiple classes here">
    <div class="some multiple classes here">
      <ul class="other classes">
        <li>
          <div class="random">some text</div>
          <div class="random1">some text1</div>
          <div class="random2">some text2</div>
        </li>
        <li>
          <div class="random">some text3</div>
          <div class="random1">some text4</div>
          <div class="random2">some text5</div>
        </li>
        <li>
          <div class="random">some text6</div>
          <div class="random1">some text7</div>
          <div class="random2">some text8</div>
        </li>
        <!-- here can appear more <li></li> elements -->
      </ul>
    </div>
  </div>
</div>
Now, in Python, I made a function which adds each message from the divs inside the li tags to a list. So my list will look like this:
messages_list = ['some text some text1 some text2', 'some text3 some text4 some text5', 'and so on..']
The function that I created uses the Selenium WebDriver to get the content from the HTML, and it looks like this:
def writeToChatTest(CHAT_URL):
    mydriver.get(CHAT_URL)
    message = "Some message to test"
    xpaths = {
        'textArea': "//*[@id='ipsTabs_elChatTabs_chatroom_panel']/div[1]/div[1]/div/div/div[1]/textarea",
        'submitMessage': "//*[@id='ipsTabs_elChatTabs_chatroom_panel']/div[1]/div[1]/div/div/div[3]/button"
    }
    time.sleep(5)
    rst_messages_list = []
    lis = mydriver.find_elements_by_xpath('//ul[@class="ipsList_reset cChatContainer"]/li')
    for li in lis:
        rst_messages_list.append(li.text)
    for unique_message in rst_messages_list:
        if "word" in unique_message:
            mydriver.find_element_by_xpath(xpaths['textArea']).clear()
            mydriver.find_element_by_xpath(xpaths['textArea']).send_keys(unique_message[0] + ": " + message)
            mydriver.find_element_by_xpath(xpaths['submitMessage']).click()
Now, the question that I'm asking is: is there any way of storing the last li tag parsed and checking if there's a new one (or more)? Also, how can I make this check run continuously?
The problem is that once I've parsed all the li tags, I'm not able to retrieve the new ones (it's a chat, so new lis appear pretty often).
Each element has a unique ID, so you could store the last li processed, but that complicates things. I would do something like:
def writeToChatTest(CHAT_URL):
    mydriver.get(CHAT_URL)
    message = "Some message to test"
    xpaths = {
        'textArea': "//*[@id='ipsTabs_elChatTabs_chatroom_panel']/div[1]/div[1]/div/div/div[1]/textarea",
        'submitMessage': "//*[@id='ipsTabs_elChatTabs_chatroom_panel']/div[1]/div[1]/div/div/div[3]/button"
    }
    parsed_messages = []
    keepRunning = True
    while keepRunning:
        time.sleep(5)
        lis = mydriver.find_elements_by_xpath('//ul[@class="ipsList_reset cChatContainer"]/li')
        rst_messages_list = []
        for li in lis:
            if li.id not in parsed_messages:  # WebElement.id is a property, not a method
                # this will end the test nicely if the message 'end selenium test' is sent in chat
                if li.text == 'end selenium test':
                    keepRunning = False
                rst_messages_list.append(li.text)
                parsed_messages.append(li.id)
        for unique_message in rst_messages_list:
            if "word" in unique_message:
                mydriver.find_element_by_xpath(xpaths['textArea']).clear()
                mydriver.find_element_by_xpath(xpaths['textArea']).send_keys(unique_message[0] + ": " + message)
                mydriver.find_element_by_xpath(xpaths['submitMessage']).click()
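One small refinement on the idea above (same page structure assumed): keep the processed ids in a set, so membership checks stay O(1) as the chat log grows.
def poll_new_messages(mydriver, parsed_ids):
    # parsed_ids is a set of WebElement ids seen so far; returns the new texts
    new_messages = []
    lis = mydriver.find_elements_by_xpath('//ul[@class="ipsList_reset cChatContainer"]/li')
    for li in lis:
        if li.id not in parsed_ids:
            parsed_ids.add(li.id)
            new_messages.append(li.text)
    return new_messages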

Python + selenium: extract variable quantity of paragraphs between titles

Fellows, assuming the HTML below, how can I extract the paragraphs <p> that belong to the title <h3>?
<!DOCTYPE html>
<html>
<body>
...
<div class="main-div">
  <h3>Title 1</h3>
  <p></p>
  <h3>Title 2</h3>
  <p></p>
  <p></p>
  <p></p>
  <h3>Title 3</h3>
  <p></p>
  <p></p>
  ...
</div>
</body>
</html>
As you can see, both the <h3> and <p> tags are children of the <div> tag, but they have no class or id that makes it possible to identify them and say that "Title 1" has 1 paragraph, Title 2 has 3 paragraphs, Title 3 has 2 paragraphs, and so on. I can't see a way to tie the paragraphs to their titles...
I'm trying to do it using Python 2.7 + Selenium. But I'm not sure that I'm working with the right tools; maybe you can suggest a solution or a different combination like BeautifulSoup, urllib2...
Any suggestion/direction would be very much appreciated!
UPDATE
After the brilliant solution pointed out by @JustMe, I came up with the solution below. Hope it helps someone else, or maybe someone can make it more Pythonic. I come from the C/C++/Java/Perl world, so I always hit the wall :)
import bs4

page = """
<!DOCTYPE html>
<html>
<body>
...
<div class="maincontent-block">
  <h3>Title 1</h3>
  <p>1</p>
  <p>2</p>
  <p>3</p>
  <h3>Title 2</h3>
  <p>2</p>
  <p>3</p>
  <p>4</p>
  <h3>Title 3</h3>
  <p>7</p>
  <p>9</p>
  ...
</div>
</body>
</html>
"""

page = bs4.BeautifulSoup(page, "html.parser")
div = page.find('div', {'class': "maincontent-block"})
mydict = {}

# write to the dictionary
for tag in div.findChildren():
    if tag.name == "h3":
        mydict[tag.string] = None
        nextTags = tag.findAllNext()
        arr = []
        for nt in nextTags:
            if nt.name == "p":
                arr.append(nt.string)
                mydict[tag.string] = arr
            elif nt.name == "h3":
                # reached the next title, stop collecting for this one
                break

# read from the dictionary
arrKeys = []
for k in mydict:
    arrKeys.append(k)
arrKeys.sort()
for k in arrKeys:
    print k
    for v in mydict[k]:
        print v
It's easy to do with BeautifulSoup:
import bs4

page = """
<!DOCTYPE html>
<html>
<body>
...
<div class="main-div">
  <h3>Title 1</h3>
  <p></p>
  <h3>Title 2</h3>
  <p></p>
  <p></p>
  <p></p>
  <h3>Title 3</h3>
  <p></p>
  <p></p>
  ...
</div>
</body>
</html>
"""

page = bs4.BeautifulSoup(page, "html.parser")

h3_tag = page.div.find("h3")
print(h3_tag.string)
>>> u'Title 1'
h3_tag.find_next_siblings("p")
>>> [<p></p>, <p></p>, <p></p>, <p></p>, <p></p>, <p></p>]
len(h3_tag.find_next_siblings("p"))/2
>>> 3
OK, since you want a separate count of paragraphs per title, I came up with this crude thing:
h_counters = []
count = -1
for child in page.div.findChildren():
    if "<h3>" in str(child):
        h_counters.append(count)
        count = 0
    else:
        count += 1
h_counters.append(count)
h_counters = h_counters[1:]
print(h_counters)
>>> [1, 3, 2]
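If you want the paragraphs themselves grouped under each title, not just counts, a sketch (reusing the parsed page from above) is to walk each h3's next siblings and stop at the following h3:
titles = {}
for h3 in page.div.find_all("h3"):
    paragraphs = []
    for sib in h3.find_next_siblings():
        if sib.name == "h3":
            # reached the next title, so this group is complete
            break
        if sib.name == "p":
            paragraphs.append(sib.get_text())
    titles[h3.string] = paragraphs
For the sample HTML this groups 1, 3 and 2 paragraphs under the three titles, matching the counter output above.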
