Parse text divided by <br> but not inside <span> - python

I can't figure out how to parse this type of data:
<div id="tabs-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom">
<strong><span itemprop="name">MOS-SCHAUM</span></strong><br>
<span itemprop="description">Antistatická pena čierna na IO 300x300x6mm</span>
<br>RoHS: Áno
<br>Obj.číslo: 13291<br>
</div>
There can be many <span> tags inside the snippet, and I don't want those. I want only the text pieces that are not inside <span> tags.
So the result would be:
{'RoHS':'Áno',
'Obj.číslo': '13291'}
I was considering .contents, but it's very unpredictable which element will be at which index.
Do you know how to do that?
EDIT:
Even if I try this:
detail_table = soup.find('div', id="tabs-1")
itemprops = detail_table.find_all('span', itemprop=re.compile('.+'))
for item in itemprops:
    data[item['itemprop']] = item
contents = detail_table.contents[-1].contents[-1].contents[-1].contents
for i, c in enumerate(contents):
    print c
    print '---'
I get this:
RoHS: Áno
# 1st element
---
<br>Obj.číslo: 68664<br>
</br></br> # 2nd element
---
EDIT2: I've just found one solution, but it's not very nice. There must be a more elegant one:
def get_data(url):
    data = {}
    soup = get_soup(url)
    """ TECHNICAL INFORMATION """
    tech_par_table = soup.find('div', id="tabs-2")
    trs = tech_par_table.find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        parameter = tds[0].text
        value = tds[1].text
        data[parameter] = value
    """ DETAIL """
    detail_table = soup.find('div', id="tabs-1")
    itemprops = detail_table.find_all('span', itemprop=re.compile('.+'))
    for item in itemprops:
        data[item['itemprop'].replace('\n', '').replace('\t', '').strip()] = item.text
    contents = detail_table.contents[-1].contents[-1].contents[-1].contents
    for i, c in enumerate(contents):
        if isinstance(c, bs4.element.NavigableString):
            splitted = c.split(':')
            data[splitted[0]] = splitted[1].replace('\n', '').replace('\t', '').strip()
        if isinstance(c, bs4.element.Tag):
            splitted = c.text.split(':')
            data[splitted[0]] = splitted[1].replace('\n', '').replace('\t', '').strip()
    return data

First you need to get all the <br> tags, then use the .next_element attribute to get whatever was parsed immediately after each <br> tag; here, your text.
d = {}
for br in soup.find_all('br'):
    text = br.next_element.strip()
    if text:
        arr = text.split(':')
        d[arr[0]] = arr[1].strip()
print(d)
yields:
{'Obj.číslo': '13291', 'RoHS': 'Áno'}
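Note that br.next_element is not guaranteed to be a string: when one <br> directly follows another, it is the next Tag, and calling .strip() on it would raise AttributeError. A slightly more defensive sketch of the same idea, reusing the soup from above:
from bs4 import NavigableString

d = {}
for br in soup.find_all('br'):
    nxt = br.next_element
    # only plain text nodes are interesting; skip Tags such as a following <br>
    if isinstance(nxt, NavigableString):
        text = nxt.strip()
        if text and ':' in text:
            key, _, value = text.partition(':')
            d[key] = value.strip()
print(d)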


beautifulsoup how to recombine words

Some of the output words are split when running this code; for example, the word "tolerances" is split into "tole rances". I looked at the HTML source and that's how the page was created.
There are also many other places where a word is split. How do I recombine them before writing to text?
import requests, codecs
from bs4 import BeautifulSoup
from bs4.element import Comment

path = 'C:\\Users\\jason\\Google Drive\\python\\'

def tag_visible(element):
    if element.parent.name in ['sup']:
        return False
    if isinstance(element, Comment):
        return False
    return True

ticker = 'TSLA'
quarter = '18Q2'
mark1 = 'ITEM 1A'
mark2 = 'UNREGISTERED SALES'
url_new = 'https://www.sec.gov/Archives/edgar/data/1318605/000156459018019254/tsla-10q_20180630.htm'

def get_text(url, mark1, mark2):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    for hr in soup.select('hr'):
        hr.find_previous('p').extract()
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    text = u" ".join(t.strip() for t in visible_texts)
    return text[text.find(mark1): text.find(mark2)]

text = get_text(url_new, mark1, mark2)
file = codecs.open(path + "test.txt", 'w', encoding='utf8')
file.write(text)
file.close()
You are dealing with HTML formatted with Microsoft Word. Don't extract text and try to process it without that context.
The section you want to process is clearly delineated with <a name="..."> tags; let's start by selecting all elements from the <a name="ITEM_1A_RISK_FACTORS"> marker, all the way up to but not including the <a name="ITEM2_UNREGISTERED_SALES"> marker:
def sec_section(soup, item_name):
    """iterate over SEC document paragraphs for the section named item_name

    Item name must be a link target, starting with ITEM
    """
    # ask BS4 to find the section
    elem = soup.select_one('a[name={}]'.format(item_name))
    # scan up to the parent text element
    # html.parser does not support <text> but lxml does
    while elem.parent is not soup and elem.parent.name != 'text':
        elem = elem.parent
    yield elem
    # now we can yield all next siblings until we find one that
    # also contains an a[name^=ITEM] element:
    for elem in elem.next_siblings:
        if not isinstance(elem, str) and elem.select_one('a[name^=ITEM]'):
            return
        yield elem
This function gives us all child nodes of the <text> node in the HTML document, starting at the paragraph that contains a specific link target, all the way through to the next link target that names an ITEM.
Next comes the usual Word cleanup task: removing <font> tags and style attributes:
def clean_word(elem):
    if isinstance(elem, str):
        return elem
    # remove last-rendered break markers, non-rendering but messy
    for lastbrk in elem.select('a[name^=_AEIOULastRenderedPageBreakAEIOU]'):
        lastbrk.decompose()
    # remove font tags and styling from the document, leaving only the contents
    if 'style' in elem.attrs:
        del elem.attrs['style']
    for e in elem:  # recursively do the same for all child nodes
        clean_word(e)
    if elem.name == 'font':
        elem = elem.unwrap()
    return elem
The Tag.unwrap() method is what'll most help your case, as the text is divided up almost arbitrarily by <font> tags.
Now it's suddenly trivial to extract the text cleanly:
for elem in sec_section(soup, 'ITEM_1A_RISK_FACTORS'):
    clean_word(elem)
    if not isinstance(elem, str):
        elem = elem.get_text(strip=True)
    print(elem)
This outputs, among the rest of the text:
•that the equipment and processes which we have selected for Model 3 production will be able to accurately manufacture high volumes of Model 3 vehicles within specified design tolerances and with high quality;
The text is now properly joined up, no re-combining required any more.
The whole section is still in a table, but clean_word() has now cleaned it up to the much more reasonable:
<div align="left">
  <table border="0" cellpadding="0" cellspacing="0">
    <tr>
      <td valign="top">
        <p> </p></td>
      <td valign="top">
        <p>•</p></td>
      <td valign="top">
        <p>that the equipment and processes which we have selected for Model 3 production will be able to accurately manufacture high volumes of Model 3 vehicles within specified design tolerances and with high quality;</p></td></tr></table></div>
so you can use smarter text extraction techniques to further ensure a clean text conversion here; you could convert such bullet tables to a * prefix, for example:
def convert_word_bullets(soup, text_bullet="*"):
    for table in soup.select('div[align=left] > table'):
        div = table.parent
        bullet = div.find(string='\u2022')
        if bullet is None:
            # not a bullet table, skip
            continue
        text_cell = bullet.find_next('td')
        div.clear()
        div.append(text_bullet + ' ')
        for i, elem in enumerate(text_cell.contents[:]):
            if i == 0 and elem == '\n':
                continue  # no need to include the first linebreak
            div.append(elem.extract())
In addition, you probably want to remove the page breaks too (a combination of <p>[page number]</p> and <hr/> elements); you can do that by running:
for pagebrk in soup.select('p ~ hr[style^=page-break-after]'):
    pagebrk.find_previous_sibling('p').decompose()
    pagebrk.decompose()
This is more explicit than your own version, where you remove all <hr/> elements and preceding <p> element regardless of whether they are actually siblings.
Execute both before cleaning up your Word HTML. Combined with your function, that together becomes:
import os  # needed for os.path.join below

def get_text(url, item_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    for pagebrk in soup.select('p ~ hr[style^=page-break-after]'):
        pagebrk.find_previous_sibling('p').decompose()
        pagebrk.decompose()
    convert_word_bullets(soup)
    cleaned_section = map(clean_word, sec_section(soup, item_name))
    return ''.join([
        elem.get_text(strip=True) if elem.name else elem
        for elem in cleaned_section])

text = get_text(url, 'ITEM_1A_RISK_FACTORS')
with open(os.path.join(path, 'test.txt'), 'w', encoding='utf8') as f:
    f.write(text)
This page's markup is really bad. You will need to remove excess tags to fix your issue. Luckily for you, BeautifulSoup can do the heavy lifting. The code below will remove all <font> tags.
soup = BeautifulSoup(html.text, 'html.parser')
for font in soup.find_all('font'):
    font.unwrap()
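For illustration, a minimal before/after sketch with a made-up fragment of the kind of markup the question describes:
from bs4 import BeautifulSoup

# hypothetical Word-style markup that splits "tolerances" across two <font> tags
html = '<p><font>tole</font><font>rances</font></p>'
soup = BeautifulSoup(html, 'html.parser')
for font in soup.find_all('font'):
    font.unwrap()

print(soup)             # <p>tolerances</p>
print(soup.get_text())  # tolerances
With the <font> tags unwrapped, the two text nodes print as one word instead of being joined with a space by the question's " ".join(...) step.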

BeautifulSoup - combine consecutive tags

I have to work with the messiest HTML where individual words are split into separate tags, like in the following example:
<b style="mso-bidi-font-weight:normal"><span style='font-size:14.0pt;mso-bidi-font-size:11.0pt;line-height:107%;font-family:"Times New Roman",serif;mso-fareast-font-family:"Times New Roman"'>I</span></b><b style="mso-bidi-font-weight:normal"><span style='font-family:"Times New Roman",serif;mso-fareast-font-family:"Times New Roman"'>NTRODUCTION</span></b>
That's kind of hard to read, but basically the word "INTRODUCTION" is split into
<b><span>I</span></b>
and
<b><span>NTRODUCTION</span></b>
having the same inline properties for both span and b tags.
What's a good way to combine these? I figured I'd loop through to find consecutive b tags like this, but I'm stuck on how to actually merge them:
for b in soup.findAll('b'):
    try:
        if b.next_sibling.name == 'b':
            pass  # combine them here??
    except AttributeError:
        pass
Any ideas?
EDIT:
Expected output is the following
<b style="mso-bidi-font-weight:normal"><span style='font-family:"Times New Roman",serif;mso-fareast-font-family:"Times New Roman"'>INTRODUCTION</span></b>
The solution below combines text from all the selected <b> tags into one <b> of your choice and decomposes the others.
If you only want to merge the text from consecutive tags follow Danny's approach.
Code:
from bs4 import BeautifulSoup

html = '''
<div id="wrapper">
    <b style="mso-bidi-font-weight:normal">
        <span style='font-size:14.0pt;mso-bidi-font-size:11.0pt;line-height:107%;font-family:"Times New Roman",serif;mso-fareast-font-family:"Times New Roman"'>I</span>
    </b>
    <b style="mso-bidi-font-weight:normal">
        <span style='font-family:"Times New Roman",serif;mso-fareast-font-family:"Times New Roman"'>NTRODUCTION</span>
    </b>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
container = soup.select_one('#wrapper')  # it contains the b tags to combine
b_tags = container.find_all('b')

# combine all the text from the b tags
text = ''.join(b.get_text(strip=True) for b in b_tags)

# here you choose a tag you want to preserve and update its text
b_main = b_tags[0]  # you can target it however you want, I just take the first one from the list
b_main.span.string = text  # replace the text

for tag in b_tags:
    if tag is not b_main:
        tag.decompose()

print(soup)
Any comments appreciated.
Perhaps you could check if b.previousSibling is a <b> tag, then append the inner text from the current node into that. After doing this, you should be able to remove the current node from the tree with b.decompose().
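A minimal sketch of that suggestion, assuming (as in the question) that each <b> holds a single <span> and the <b> tags are directly adjacent:
from bs4 import BeautifulSoup

html = '<b><span>I</span></b><b><span>NTRODUCTION</span></b>'  # simplified from the question
soup = BeautifulSoup(html, 'html.parser')
for b in soup.find_all('b'):
    prev = b.previous_sibling
    if prev and prev.name == 'b':
        # fold this tag's text into the previous <b>'s <span> ...
        prev.span.string = prev.span.get_text() + b.get_text()
        b.decompose()  # ... then remove the current node

print(soup)  # <b><span>INTRODUCTION</span></b>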
The approach I have used for this problem is to insert one element inside the other, then unwrap() it, which will preserve all nested text and tags -- unlike approaches using the text contents of the elements.
For example:
for b in soup.find_all('b'):
    prev = b.previous_sibling
    if prev and prev.name == 'b':  # Any conditions needed to decide to merge
        b.insert(0, prev)  # Move the previous element inside this one
        prev.unwrap()      # Unwrap <b><b>prev</b> b</b> into <b>prev b</b>
Note the use of previous_sibling instead of next_sibling so that we don't modify subsequent parts of the soup that we are about to iterate over.
Then we might want to repeat the process with <span> to achieve the final result. Perhaps also check b['style'] == prev['style'] if that condition is desired for merging.
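Applied twice (first to the <b> tags, then to the now-adjacent <span> tags inside), this yields the expected output; a condensed sketch with abbreviated style attributes:
from bs4 import BeautifulSoup

html = ('<b style="x"><span style="y1">I</span></b>'
        '<b style="x"><span style="y2">NTRODUCTION</span></b>')  # styles shortened
soup = BeautifulSoup(html, 'html.parser')

for name in ('b', 'span'):
    for tag in soup.find_all(name):
        prev = tag.previous_sibling
        if prev and prev.name == name:
            tag.insert(0, prev)  # move the previous element inside this one
            prev.unwrap()        # keep its children, drop the duplicate wrapper

print(soup)  # <b style="x"><span style="y2">INTRODUCTION</span></b>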
The adjacent answer combines only the tags' text, without preserving nested tags such as <i>.
The following code handles that.
For example, for this html:
<div>
<p>A<i>b</i>cd1, <i>a</i><b><i>b</i></b><i>cd2</i> abcd3 <i>ab</i></p>
<p>cd4 <i>a</i><i>bsd5</i> <i>ab<span>cd6</span></i></p>
</div>
the result will be:
<div>
<p>A<i>b</i>cd1, <i>a<b>b</b>cd2</i> abcd3 <i>ab</i></p>
<p>cd4 <i>absd5 ab<span>cd6</span></i></p>
</div>
In the ignoring_tags_names variable, you can set which tags are considered nested and are ignored when merging. Any other tag will break the merging chain.
In the re_symbols_ignore variable, you can set which text characters between same-named tags are ignored when concatenating. Any other character will break the merging chain.
You can also specify a check for the identity of tag attributes. Note that attribute order is not normalized: {class: ['a', 'b']} and {class: ['b', 'a']} are considered different, so such tags are not combined.
import re
from bs4 import BeautifulSoup, NavigableString


def find_and_combine_tags(soup, init_tag_name: str, init_tag_attrs: dict = None):
    init_tag_attrs = init_tag_attrs or {}

    def combine_tags(tag, tags: list):
        # appending the tag chain to the first tag
        for t in tags:
            tag.append(t)
        # unwrapping them
        for t in tag.find_all(init_tag_name):
            if t.name == init_tag_name and t.attrs == init_tag_attrs:
                t.unwrap()

    def fill_next_siblings(tag, init_tag_name: str, ignoring_tags_names: list) -> list:
        next_siblings = []
        for t in tag.next_siblings:
            if isinstance(t, NavigableString) and re_symbols_ignore.match(t):
                # ignorable characters between the tags do not break the chain
                next_siblings.append(t)
            elif isinstance(t, NavigableString):
                # any other text breaks the merging chain
                break
            elif t.name in ignoring_tags_names and t.attrs == init_tag_attrs:  # also checking the tag attrs
                next_siblings.append(t)
            else:
                # filling `next_siblings` until another tag is met
                break
        has_other_tag_met = False
        for t in next_siblings:
            if t.name == init_tag_name and t.attrs == init_tag_attrs:
                has_other_tag_met = True
                break
        # removing unwanted tags on the tail of `next_siblings`
        if has_other_tag_met:
            while True:
                last_tag = next_siblings[-1]
                if isinstance(last_tag, NavigableString):
                    next_siblings.pop()
                elif last_tag.name != init_tag_name and last_tag.attrs != init_tag_attrs:
                    next_siblings.pop()
                else:
                    break
        return next_siblings

    # Ignored nested tag names
    if init_tag_name in ['i', 'b', 'em']:
        ignoring_tags_names = ['i', 'b', 'em']
    elif init_tag_name in ['div']:
        # block tags can have many nested tags
        ignoring_tags_names = ['div', 'p', 'span', 'a']
    else:
        ignoring_tags_names = []

    # Some symbols between same-named tags can be pulled into them,
    # because they don't change the font style.
    if init_tag_name == 'i':
        # Italic doesn't change the style of some characters (spaces, period, comma), so they can be combined
        re_symbols_ignore = re.compile(r'^[\s.,-]+$')
    elif init_tag_name == 'b':
        # Bold changes the style of all characters
        re_symbols_ignore = re.compile(r'^[\s]+$')
    elif init_tag_name == 'div':
        # Be careful with merging here, because HTML can have `\n` between block tags (like `div`s)
        re_symbols_ignore = re.compile(r'^[\s]+$')
    else:
        re_symbols_ignore = None

    all_wanted_tags = soup.find_all(init_tag_name)
    if all_wanted_tags:
        tag_groups_to_combine = []
        tag = all_wanted_tags[0]
        last_tag = tag
        while True:
            tags_to_append = fill_next_siblings(tag, init_tag_name, ignoring_tags_names)
            if tags_to_append:
                tag_groups_to_combine.append((tag, tags_to_append))  # the first tag and the tags to append
            # looking for the next tags group
            last_tag = tags_to_append[-1] if tags_to_append else tag
            for tag in all_wanted_tags:
                if tag.sourceline > last_tag.sourceline \
                        or (tag.sourceline == last_tag.sourceline and tag.sourcepos > last_tag.sourcepos):
                    break
            # stop once we have reached the position of the last wanted tag
            if last_tag.sourceline == all_wanted_tags[-1].sourceline \
                    and last_tag.sourcepos == all_wanted_tags[-1].sourcepos:
                break
            last_tag = tag
        for first_tag, tags_to_append in tag_groups_to_combine:
            combine_tags(first_tag, tags_to_append)
    return soup
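A quick usage sketch on the answer's own sample; note that the function relies on sourceline/sourcepos, which are populated when parsing with html.parser:
html = '''<div>
<p>A<i>b</i>cd1, <i>a</i><b><i>b</i></b><i>cd2</i> abcd3 <i>ab</i></p>
<p>cd4 <i>a</i><i>bsd5</i> <i>ab<span>cd6</span></i></p>
</div>'''

soup = BeautifulSoup(html, 'html.parser')
print(find_and_combine_tags(soup, 'i'))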

Combine multiple tags with lxml

I have an html file which looks like:
...
<p>
<strong>This is </strong>
<strong>a lin</strong>
<strong>e which I want to </strong>
<strong>join.</strong>
</p>
<p>
2.
<strong>But do not </strong>
<strong>touch this</strong>
<em>Maybe some other tags as well.</em>
bla bla blah...
</p>
...
What I need is, if all the tags in a 'p' block are 'strong', then combine them into one line, i.e.
<p>
<strong>This is a line which I want to join.</strong>
</p>
Without touching the other block since it contains something else.
Any suggestions? I am using lxml.
UPDATE:
So far I tried:
for p in self.tree.xpath('//body/p'):
    if p.tail is None:  # no text before first element
        children = p.getchildren()
        for child in children:
            if len(children) == 1 or child.tag != 'strong' or child.tail is not None:
                break
        else:
            etree.strip_tags(p, 'strong')
With this code I was able to strip off the strong tags in the desired part, giving:
<p>
This is a line which I want to join.
</p>
So now I just need a way to put the tag back in...
I was able to do this with bs4 (BeautifulSoup):
from bs4 import BeautifulSoup as bs
html = """<p>
<strong>This is </strong>
<strong>a lin</strong>
<strong>e which I want to </strong>
<strong>join.</strong>
</p>
<p>
<strong>But do not </strong>
<strong>touch this</strong>
</p>"""
soup = bs(html)
s = ''
# note that I use the 0th <p> block ...[0],
# so make the appropriate change in your code
for t in soup.find_all('p')[0].text:
    s = s + t.strip('\n')
s = '<p><strong>' + s + '</strong></p>'
print s  # prints: <p><strong>This is a line which I want to join.</strong></p>
Then use replace_with():
p_tag = soup.p
p_tag.replace_with(bs(s, 'html.parser'))
print soup
prints:
<html><body><p><strong>This is a line which I want to join.</strong></p>
<p>
<strong>But do not </strong>
<strong>touch this</strong>
</p></body></html>
I have managed to solve my own problem.
for p in self.tree.xpath('//body/p'):
    if p.tail is None:  # some conditions specifically for my doc
        children = p.getchildren()
        if len(children) > 1:
            for child in children:
                # if other stuff is present, break
                if child.tag != 'strong' or child.tail is not None:
                    break
            else:
                # If we didn't break, we found a p block to fix:
                # get rid of the stuff inside p, and put a SubElement in
                etree.strip_tags(p, 'strong')
                tmp_text = p.text_content()
                p.clear()
                subtext = etree.SubElement(p, "strong")
                subtext.text = tmp_text
Special thanks to @Scott, who helped me come down to this solution. Although I cannot mark his answer correct, I have no less appreciation for his guidance.
Alternatively, you can use more specific xpath to get the targeted p elements directly :
p_target = """
//p[strong]
   [not(*[not(self::strong)])]
   [not(text()[normalize-space()])]
"""
for p in self.tree.xpath(p_target):
    # logic inside the loop can also be the same as your `else` block
    content = p.xpath("normalize-space()")
    p.clear()
    strong = etree.SubElement(p, "strong")
    strong.text = content
A brief explanation of the XPath being used:
//p[strong] : find p element, anywhere in the XML/HTML document, having child element strong...
[not(*[not(self::strong)])] : ..and not having child element other than strong...
[not(text()[normalize-space()])] : ..and not having non-empty text node child.
normalize-space() : get all text nodes from current context element, concatenated with consecutive whitespaces normalized to single space
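A self-contained sketch of this approach on a condensed version of the question's markup (whitespace between the <strong> elements removed so normalize-space() does not insert stray spaces):
from lxml import etree

html = ('<body>'
        '<p><strong>This is </strong><strong>a lin</strong>'
        '<strong>e which I want to </strong><strong>join.</strong></p>'
        '<p>2. <strong>But do not </strong><strong>touch this</strong>'
        '<em>Maybe some other tags as well.</em></p>'
        '</body>')

tree = etree.HTML(html)
p_target = "//p[strong][not(*[not(self::strong)])][not(text()[normalize-space()])]"
for p in tree.xpath(p_target):
    content = p.xpath("normalize-space()")
    p.clear()
    etree.SubElement(p, "strong").text = content

print(etree.tostring(tree, pretty_print=True).decode())
# the first <p> becomes <p><strong>This is a line which I want to join.</strong></p>;
# the second <p> is untouched because it contains text and an <em> child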

How to extract tags from HTML using Beautifulsoup in Python

I am trying to parse through an HTML page which simplified looks like this:
<div class="anotherclass part">
<a href="http://example.com" >
<div class="column abc"><strike>£3.99</strike><br>£3.59</div>
<div class="column def"></div>
<div class="column ghi">1 Feb 2013</div>
<div class="column jkl">
<h4>A title</h4>
<p>
<img class="image" src="http://example.com/image.jpg">A, List, Of, Terms, To, Extract - 1 Feb 2013</p>
</div>
</a>
</div>
I am a beginner at coding python and I have read and re-read the beautifulsoup documentation at http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
I have got this code:
import re
from BeautifulSoup import BeautifulSoup

with open("file.html") as fp:
    html = fp.read()
soup = BeautifulSoup(html)
parts = soup.findAll('a', attrs={"class": re.compile('part', re.IGNORECASE)})
for part in parts:
    mypart = {}
    # ghi
    mypart['ghi'] = part.find(attrs={"class": re.compile('ghi')}).string
    # def
    mypart['def'] = part.find(attrs={"class": re.compile('def')}).string
    # h4
    mypart['title'] = part.find('h4').string
    # jkl
    mypart['other'] = part.find('p').string
    # abc
    pattern = re.compile(r'\&\#163\;(\d{1,}\.?\d{2}?)')
    theprices = re.findall(pattern, str(part))
    if len(theprices) == 2:
        mypart['price'] = theprices[1]
        mypart['rrp'] = theprices[0]
    elif len(theprices) == 1:
        mypart['price'] = theprices[0]
        mypart['rrp'] = theprices[0]
    else:
        mypart['price'] = None
        mypart['rrp'] = None
I want to extract any text from the classes def and ghi which I think my script does correctly.
I also want to extract the two prices from abc which my script does in a rather clunky fashion at the moment. Sometimes there are two prices, sometimes one and sometimes none in this part.
Finally I want to extract the "A, List, Of, Terms, To, Extract" part from class jkl which my script fails to do. I thought getting the string part of the p tag would work but I cannot understand why it does not. The date in this part always matches the date in class ghi so it should be easy to replace/remove it.
Any advice? Thank-you!
First, if you add convertEntities=bs.BeautifulSoup.HTML_ENTITIES, as in
soup = bs.BeautifulSoup(html, convertEntities=bs.BeautifulSoup.HTML_ENTITIES)
then HTML entities such as &#163; will be converted to their corresponding unicode character, such as £. This will allow you to use a simpler regex to identify the prices.
Now, given part, you can find the text content in the <div> with the prices using its contents attribute:
In [37]: part.find(attrs={"class": re.compile('abc')}).contents
Out[37]: [<strike>£3.99</strike>, <br />, u'\xa33.59']
All we need to do is extract the number from each item, or skip it if there is no number:
def parse_price(text):
    try:
        return float(re.search(r'\d*\.\d+', text).group())
    except (TypeError, ValueError, AttributeError):
        return None

price = []
for item in part.find(attrs={"class": re.compile('abc')}).contents:
    item = parse_price(item.string)
    if item:
        price.append(item)
At this point price will be a list of 0, 1, or 2 floats.
We would like to say
mypart['rrp'], mypart['price'] = price
but that would not work if price is [] or contains only one item.
Your method of handling the three cases with if..else is okay -- it is the most straightforward and arguably the most readable way to proceed. But it is also a bit mundane. If you'd like something a little more terse you could do the following:
Since we want to repeat the same price if price contains only one item, you might be led to think about itertools.cycle.
In the case where price is the empty list, [], we want itertools.cycle([None]), but otherwise we could use itertools.cycle(price).
So to combine both cases into one expression, we could use
price = itertools.cycle(price or [None])
mypart['rrp'], mypart['price'] = next(price), next(price)
The next function peels off the values in the iterator price one by one. Since price is cycling through its values, it will never end; it will just keep yielding the values in sequence and then starting over again if necessary -- which is just what we want.
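To see the trick in isolation, a small sketch covering the three possible cases:
import itertools as IT

for prices in ([], [3.59], [3.99, 3.59]):
    it = IT.cycle(prices or [None])
    rrp, sale = next(it), next(it)
    print(rrp, sale)
# None None
# 3.59 3.59
# 3.99 3.59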
The A, List, Of, Terms, To, Extract - 1 Feb 2013 could be obtained again through the use of the contents attribute:
# jkl
mypart['other'] = [item for item in part.find('p').contents
                   if not isinstance(item, bs.Tag) and item.string.strip()]
So, the full runnable code would look like:
import BeautifulSoup as bs
import os
import re
import itertools as IT

def parse_price(text):
    try:
        return float(re.search(r'\d*\.\d+', text).group())
    except (TypeError, ValueError, AttributeError):
        return None

filename = os.path.expanduser("~/tmp/file.html")
with open(filename) as fp:
    html = fp.read()

soup = bs.BeautifulSoup(html, convertEntities=bs.BeautifulSoup.HTML_ENTITIES)

for part in soup.findAll('div', attrs={"class": re.compile('(?i)part')}):
    mypart = {}
    # abc
    price = []
    for item in part.find(attrs={"class": re.compile('abc')}).contents:
        item = parse_price(item.string)
        if item:
            price.append(item)
    price = IT.cycle(price or [None])
    mypart['rrp'], mypart['price'] = next(price), next(price)
    # jkl
    mypart['other'] = [item for item in part.find('p').contents
                       if not isinstance(item, bs.Tag) and item.string.strip()]
    print(mypart)
which yields
{'price': 3.59, 'other': [u'A, List, Of, Terms, To, Extract - 1 Feb 2013'], 'rrp': 3.99}

Python, extract tags and also get position of word

I have a String,
data = 'very <strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong> discount'
I want to get the output in a list as
ans = ['very','<strong class="keyword">Awesome</strong>','<strong class="keyword">Book</strong>','discount']
So I can have the position of each word, and also know which words occurred inside tags.
I used BeautifulSoup to extract the words inside <strong> tags and the words that are not inside <strong> tags, but I also need to find their positions.
The code I tried:
import re
from bs4 import BeautifulSoup as BS

data = 'very <strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong>'
soup = BS(data)
to_extract = soup.findAll('strong')
[comment.extract() for comment in to_extract]
soup = str(soup)
InStrongWords = []
for t in to_extract:
    t_soup = BS('{0}'.format(t))
    t_tag = t_soup.strong
    InStrongWords.append(t_tag.string)
soup = re.sub("[^A-Za-z0-9\\-\\.\\(\\)\\\\\/\\&': ]+", ' ', soup)
soup = re.findall('[(][^)]*[)]|\S+', soup)
notInStrongWords = [x for x in soup]
Thanks in Advance.
Based on Andrew Alcock's answer (thank you, Andrew).
Let's say
data = ['very <strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong>','<strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong> discount']
so, for Python 2.x and BeautifulSoup 4:
from bs4 import BeautifulSoup as BS

for d in data:
    soup = BS(d)
    soupPTag = soup.p
    if soupPTag:
        soupList = [unicode(child) for child in soupPTag.children if child != " "]
        print soupList
    else:
        soupBodyTag = soup.body
        soupList = [unicode(child) for child in soupBodyTag.children if child != " "]
        print soupList
This will give the required answer.
Try (for Python 2.x - Python 3 does unicode differently):
from bs4 import BeautifulSoup as BS

data = 'very <strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong>'
soup = BS(data)
pTag = soup.p
list = [unicode(child) for child in pTag.children]
print list
Returns:
[u'very ', u'<strong class="keyword">Awesome</strong>', u' ', u'<strong class="keyword">Book</strong>']
Basically, this iterates over the child elements and turns them back into Unicode strings. You may want to filter out the space, but it is technically present in your HTML.
If you need to check which children are "strong", you could do something like this:
import bs4

data = 'very <strong class="keyword">Awesome</strong> <strong class="keyword">Book</strong>'
soup = bs4.BeautifulSoup(data)
list = [(child.name if isinstance(child, bs4.Tag) else None, unicode(child))
        for child in soup.children]
print list
Which returns a list of tuples, each tuple being the (name of the tag or None where no tag, HTML):
[(None, u'very '), (u'strong', u'<strong class="keyword">Awesome</strong>'), (None, u' '), (u'strong', u'<strong class="keyword">Book</strong>')]
re.finditer (instead of re.findall) gives you match objects, from which you can get the start() and end() of each match.
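For instance, a sketch of that idea on the question's string, using a hypothetical pattern that matches either a whole <strong> element or a bare word:
import re

data = ('very <strong class="keyword">Awesome</strong> '
        '<strong class="keyword">Book</strong> discount')

# alternation: a complete <strong>...</strong> element, or any run of non-space characters
pattern = re.compile(r'<strong[^>]*>.*?</strong>|\S+')
ans = []
for m in pattern.finditer(data):
    print(m.start(), m.end(), m.group())
    ans.append(m.group())
# ans == ['very', '<strong class="keyword">Awesome</strong>',
#         '<strong class="keyword">Book</strong>', 'discount']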
