Web scraping a hidden table using Python - python

I am trying to scrape the "Traits" table from this website https://www.ebi.ac.uk/gwas/genes/SAMD12 (actually, the URL can change according to my necessity, but the structure will be the same).
The problem is that my knowledge is quite limited in web scraping, and I can't get this table using the basic BeautifulSoup workflow I've seen up to here.
Here's my code:
import requests
from bs4 import BeautifulSoup

# Fetch the GWAS gene page; swap the gene symbol in the URL as needed.
url = 'https://www.ebi.ac.uk/gwas/genes/SAMD12'
page = requests.get(url)
# Parse the response so the document can be searched (the original snippet
# used `soup` below without ever creating it -- NameError).
soup = BeautifulSoup(page.content, 'html.parser')
I'm looking for the "efotrait-table":
# Locate the wrapper around the "Traits" table.  The table body itself is
# filled in by JavaScript after page load, so in this static HTML it is empty.
efotrait = soup.find('div', id='efotrait-table-loading')
print(efotrait.prettify())
<div class="row" id="efotrait-table-loading" style="margin-top:20px">
<div class="panel panel-default" id="efotrait_panel">
<div class="panel-heading background-color-primary-accent">
<h3 class="panel-title">
<span class="efotrait_label">
Traits
</span>
<span class="efotrait_count badge available-data-btn-badge">
</span>
</h3>
<span class="pull-right">
<span class="clickable" onclick="toggleSidebar('#efotrait_panel span.clickable')" style="margin-left:25px">
<span class="glyphicon glyphicon-chevron-up">
</span>
</span>
</span>
</div>
<div class="panel-body">
<table class="table table-striped borderless" data-export-types="['csv']" data-filter-control="true" data-flat="true" data-icons="icons" data-search="true" data-show-columns="true" data-show-export="true" data-show-multi-sort="false" data-sort-name="numberAssociations" data-sort-order="desc" id="efotrait-table">
</table>
</div>
</div>
</div>
Specifically, this one:
# CSS-select the table element by id; index 0 takes the first (only) match.
# It has no rows because the content is loaded client-side.
soup.select('table#efotrait-table')[0]
<table class="table table-striped borderless" data-export-types="['csv']" data-filter-control="true" data-flat="true" data-icons="icons" data-search="true" data-show-columns="true" data-show-export="true" data-show-multi-sort="false" data-sort-name="numberAssociations" data-sort-order="desc" id="efotrait-table">
</table>
As you can see, the table's content doesn't show up. In the website, there's an option for saving the table as csv. It would be awesome if I get this downloadable link somehow. But when I click in the link in order to copy it, I get "javascript:void(0)" instead. I've not studied javascript, should I?
The table is hidden, and even if it's not, I would need to interactively select more rows per page to get the whole table (and the URL doesn't change, so I can't get the table either).
I would like to know a way to access this table programmatically (even as unstructured info); the minor details of organizing the table can come later. Any clues about how to do that (or what I should study) would be greatly appreciated.
Thanks in advance

Desired data is available within API call.
import requests

# Query payload for the GWAS Catalog advanced-filter (Solr-backed) endpoint.
# "q" matches the gene in either the mapped-gene or association fields; the
# remaining keys request every record ("max"), group/facet by resource type,
# and list the fields to return ("fl").
data = {
    "q": "ensemblMappedGenes: \"SAMD12\" OR association_ensemblMappedGenes: \"SAMD12\"",
    "max": "99999",
    "group.limit": "99999",
    "group.field": "resourcename",
    "facet.field": "resourcename",
    "hl.fl": "shortForm,efoLink",
    "hl.snippets": "100",
    "fl": "accessionId,ancestralGroups,ancestryLinks,associationCount,association_rsId,authorAscii_s,author_s,authorsList,betaDirection,betaNum,betaUnit,catalogPublishDate,chromLocation,chromosomeName,chromosomePosition,context,countriesOfRecruitment,currentSnp,efoLink,ensemblMappedGenes,fullPvalueSet,genotypingTechnologies,id,initialSampleDescription,label,labelda,mappedLabel,mappedUri,merged,multiSnpHaplotype,numberOfIndividuals,orPerCopyNum,orcid_s,pValueExponent,pValueMantissa,parent,positionLinks,publication,publicationDate,publicationLink,pubmedId,qualifier,range,region,replicateSampleDescription,reportedGene,resourcename,riskFrequency,rsId,shortForm,snpInteraction,strongestAllele,studyId,synonym,title,traitName,traitName_s,traitUri,platform",
    "raw": "fq:resourcename:association or resourcename:study"
}


def main(url):
    """POST the query and print the decoded JSON response.

    Drill into the returned dict (see r.keys()) for the traits data.
    """
    r = requests.post(url, data=data).json()
    print(r)


main("https://www.ebi.ac.uk/gwas/api/search/advancefilter")
You can follow the r.keys() and load your desired data by access the dict.
But here's a quick load (Lazy Code):
import requests
import re
import pandas as pd

# Same payload as the previous example -- see the GWAS Catalog
# advanced-filter endpoint notes above.
data = {
    "q": "ensemblMappedGenes: \"SAMD12\" OR association_ensemblMappedGenes: \"SAMD12\"",
    "max": "99999",
    "group.limit": "99999",
    "group.field": "resourcename",
    "facet.field": "resourcename",
    "hl.fl": "shortForm,efoLink",
    "hl.snippets": "100",
    "fl": "accessionId,ancestralGroups,ancestryLinks,associationCount,association_rsId,authorAscii_s,author_s,authorsList,betaDirection,betaNum,betaUnit,catalogPublishDate,chromLocation,chromosomeName,chromosomePosition,context,countriesOfRecruitment,currentSnp,efoLink,ensemblMappedGenes,fullPvalueSet,genotypingTechnologies,id,initialSampleDescription,label,labelda,mappedLabel,mappedUri,merged,multiSnpHaplotype,numberOfIndividuals,orPerCopyNum,orcid_s,pValueExponent,pValueMantissa,parent,positionLinks,publication,publicationDate,publicationLink,pubmedId,qualifier,range,region,replicateSampleDescription,reportedGene,resourcename,riskFrequency,rsId,shortForm,snpInteraction,strongestAllele,studyId,synonym,title,traitName,traitName_s,traitUri,platform",
    "raw": "fq:resourcename:association or resourcename:study"
}


def main(url):
    """Quick-and-dirty load: regex-scrape (mappedLabel, traitName_s) pairs.

    The set comprehension also de-duplicates repeated traits before the
    pairs are loaded into a DataFrame.
    """
    r = requests.post(url, data=data)
    match = {item.group(2, 1) for item in re.finditer(
        r'traitName_s":\"(.*?)\".*?mappedLabel":\["(.*?)\"', r.text)}
    df = pd.DataFrame.from_dict(match)
    print(df)


main("https://www.ebi.ac.uk/gwas/api/search/advancefilter")
Output:
0 heel bone mineral density Heel bone mineral density
1 interleukin-8 measurement Chronic obstructive pulmonary disease-related ...
2 self reported educational attainment Educational attainment (years of education)
3 waist-hip ratio Waist-hip ratio
4 eye morphology measurement Eye morphology
5 CC16 measurement Chronic obstructive pulmonary disease-related ...
6 age-related hearing impairment Age-related hearing impairment (SNP x SNP inte...
7 eosinophil percentage of leukocytes Eosinophil percentage of white cells
8 coronary artery calcification Coronary artery calcified atherosclerotic plaq...
9 multiple sclerosis Multiple sclerosis
10 mathematical ability Highest math class taken (MTAG)
11 risk-taking behaviour General risk tolerance (MTAG)
12 coronary artery calcification Coronary artery calcified atherosclerotic plaq...
13 self reported educational attainment Educational attainment (MTAG)
14 pancreatitis Pancreatitis
15 hair colour measurement Hair color
16 breast carcinoma Breast cancer specific mortality in breast cancer
17 eosinophil count Eosinophil counts
18 self rated health Self-rated health
19 bone density Bone mineral density

Related

Extract Colored Text from Table with BeautifulSoup

I am new to Python and fairly new to programming in general. I'm trying to work out a script that uses BeautifulSoup to parse https://www.state.nj.us/mvc/ for any text that's red. The table I'm looking at is relatively simple HTML:
<html>
<body>
<div class="alert alert-warning alert-dismissable" role="alert">
<div class="table-responsive">
<table class="table table-sm" align="center" cellpadding="0" cellspacing="0">
<tbody>
<tr>
<td width="24%">
<strong>
<font color="red">Bakers Basin</font>
</strong>
</td>
<td width="24%">
<strong>Oakland</strong>
</td>
...
...
...
</tr>
</tbody>
</table>
</div>
</div>
</body>
</html>
From the above I want to find Bakers Basin, but not Oakland, for example.
Here's the Python I've written (adapted from Cory Althoff The Self-Taught Programmer, 2017, Triangle Connection LCC):
import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    """Fetch a page and print the text of every red <font> element."""

    def __init__(self, site):
        self.site = site

    def scrape(self):
        r = urllib.request.urlopen(self.site)
        html = r.read()
        parser = "html.parser"
        soup = BeautifulSoup(html, parser)
        # find_all() takes a tag name plus an attrs dict.  The original
        # passed the single string 'font color="red"', which is treated as
        # a (nonexistent) tag name and matches nothing.
        tabledmv = soup.find_all("font", {"color": "red"})
        for tag in tabledmv:
            # Print each matched tag's text.  The original called
            # tabledmv.get_text() -- a ResultSet has no get_text().
            print("\n" + tag.get_text())


website = "https://www.state.nj.us/mvc/"
Scraper(website).scrape()
I seem to be missing something here though because I can't seem to get this to scrape through the table and return anything useful. The end result is I want to add the time module and run this every X minutes, then to have it log a message somewhere for when each site goes red. (This is all so my wife can figure out the least crowded DMV to go to in New Jersey!).
Any help or guidance is much appreciated on getting the BeautifulSoup bit working.
The table is actually loaded from this site.
To only get text that's red you can use the CSS selector soup.select('font[color="red"]') as #Mr. Polywhirl mentioned:
import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    """Scrape a page and print every red <font> entry (first match skipped)."""

    def __init__(self, site):
        self.site = site

    def scrape(self):
        response = urllib.request.urlopen(self.site)
        markup = response.read()
        soup = BeautifulSoup(markup, "html.parser")
        # CSS attribute selector picks only red <font> tags; [1:] drops the
        # first match (a legend/header entry).
        red_fonts = soup.select('font[color="red"]')[1:]
        for entry in red_fonts:
            print(entry.get_text())


website = "https://www.state.nj.us/mvc/locations/agency.htm"
Scraper(website).scrape()
The data is loaded from other location, in this case 'https://www.state.nj.us/mvc/locations/agency.htm'. To get the towns + header for each town, you can use this example:
import requests
from bs4 import BeautifulSoup

url = 'https://www.state.nj.us/mvc/locations/agency.htm'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Cells containing a <font> tag are the highlighted towns; the cell's column
# position within its row decides which kind of center it is.
for cell in soup.select('td:has(font)'):
    column = cell.find_previous('tr').select('td').index(cell)
    kind = 'Licensing Centers' if column < 2 else 'Vehicle Centers'
    town = ' '.join(cell.text.split())
    print('{:<20} {}'.format(town, kind))
Prints:
Bakers Basin Licensing Centers
Cherry Hill Vehicle Centers
Springfield Vehicle Centers
Bayonne Licensing Centers
Paterson Licensing Centers
East Orange Vehicle Centers
Trenton Vehicle Centers
Rahway Licensing Centers
Hazlet Vehicle Centers
Turnersville Vehicle Centers
Jersey City Vehicle Centers
Wallington Vehicle Centers
Delanco Licensing Centers
Lakewood Vehicle Centers
Washington Vehicle Centers
Eatontown Licensing Centers
Edison Licensing Centers
Toms River Licensing Centers
Newton Vehicle Centers
Freehold Licensing Centers
Runnemede Vehicle Centers
Newark Licensing Centers
S. Brunswick Vehicle Centers

Can anyone suggest me to take xpath of these fields

<div class="the_content">
<p><strong>Niki Jones Agency, Inc</strong></p>
<p>Ms. Niki Jones</p>
<p>39 Front Street</p>
<p>Port Jervis</p>
<p>NY, 12771</p>
<p>(845) 856-1266</p>
<p>njones@nikijones.com</p>
<p>www.Nikijones.com</p>
<p>20 Years in the PR & Marketing business : Graphic design, Publications, Websites design& Development, Digital Ads, Campaigns, Social Media, Direct Mail, Website security,ADA compliance 508</p>
<div class="apss-social-share apss-theme-1 clearfix">
<div class="the_content">
<p><strong>JMB Electric Supply, LLC</strong></p>
<p>Joanne M. Barish</p>
<p>17 Belmont Street</p>
<p>White Plains, New York 10605</p>
<p>Tel: (914) 260-1895</p>
<p>Fax: 914-722-3277</p>
<p>Email: jmbelec@optionline.net</p>
<p>Website: http://jmbelec.net/</p>
<p>Description: Master distributor of Electronic and Magnetic Low Voltage Transformers & Ballasts selling throughout the United States, as well as internationally.</p>
<div class="apss-social-share apss-theme-1 clearfix">
step 1: Use below xpath to get all div elements.
/div[@class='the_content']
Step 2: for each div elements,
For company name
/p/strong
For other p tags:
/p[2]
2 is the 2nd p tag. So, you can use 3,4,5...
Please see https://www.w3schools.com/xml/xml_xpath.asp

Store web scraping results in DataFrame or dictionary

I'm taking an online course, and I'm trying to automate the process capturing the course structure for my personal notes, which I keep locally in a Markdown file.
Here's an example chapter:
And here's a sample of how the HTML looks:
<!-- Header of the chapter -->
<div class="chapter__header">
<div class="chapter__title-wrapper">
<span class="chapter__number">
<span class="chapter-number">1</span>
</span>
<h4 class="chapter__title">
Introduction to Experimental Design
</h4>
<span class="chapter__price">
Free
</span>
</div>
<div class="dc-progress-bar dc-progress-bar--small chapter__progress">
<span class="dc-progress-bar__text">0%</span>
<div class="dc-progress-bar__bar chapter__progress-bar">
<span class="dc-progress-bar__fill" style="width: 0%;"></span>
</div>
</div>
</div>
<p class="chapter__description">
An introduction to key parts of experimental design plus some power and sample size calculations.
</p>
<!-- !Header of the chapter -->
<!-- Body of the chapter -->
<ul class="chapter__exercises hidden">
<li class="chapter__exercise ">
<a class="chapter__exercise-link" href="https://campus.datacamp.com/courses/experimental-design-in-r/introduction-to-experimental-design?ex=1">
<span class="chapter__exercise-icon exercise-icon ">
<img width="23" height="23" src="https://cdn.datacamp.com/main-app/assets/courses/icon_exercise_video-3b15ea50771db747f7add5f53e535066f57d9f94b4b0ebf1e4ddca0347191bb8.svg" alt="Icon exercise video" />
</span>
<h5 class="chapter__exercise-title" title='Intro to Experimental Design'>Intro to Experimental Design</h5>
<span class="chapter__exercise-xp">
50 xp
</span>
</a> </li>
So far, I've used BeautifulSoup to pull out all the relevant information:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'https://www.datacamp.com/courses/experimental-design-in-r'
soup = BeautifulSoup(urlopen(url), 'lxml')

outline_list = []
# Chapter titles are <h4> tags and exercises are <li> tags; walking both in
# a single find_all() keeps them in page order.
for element in soup.find_all(['h4', 'li']):
    try:
        css_class = element.attrs['class'][0]
        if css_class == 'chapter__title':
            outline_list.append(element.text.strip())
        if css_class == 'chapter__exercise':
            name = element.find('h5').text
            link = element.find('a').attrs['href']
            outline_list.extend([name, link])
    except KeyError:
        # Elements without a class (or links without href) are skipped.
        pass
This gives me a list like this:
['Introduction to Experimental Design', 'Intro to Experimental Design', 'https://campus.datacamp.com/courses/experimental-design-in-r/introduction-to-experimental-design?ex=1',...]
My goal is to put this all into an .md file that would look something like this:
# Introduction to Experimental Design
* [Intro to Experimental Design](https://campus.datacamp.com/courses/experimental-design-in-r/introduction-to-experimental-design?ex=1)
* ['A basic experiment](https://campus.datacamp.com/courses/experimental-design-in-r/introduction-to-experimental-design?ex=2)
My question is: What's the best way to structure this data so that I can easily access it later on when I'm writing the text file? Would it be better to have a DataFrame with columns chapter, lesson, lesson_link? A DataFrame with a MultiIndex? A nested dictionary? If it were a dictionary, what should I name the keys? Or is there another option I'm missing? Some sort of database?
Any thoughts would be much appreciated!
If I see it right, you're currently appending every element, in order of its appearance, to the list outline_list. But obviously you don't have 1, but instead 3 types of distinct data:
chapter__title
chapter__exercise.name
chapter__exercise.link
Each title can have multiple exercises, which are always a pair of name and link. Since you also want to keep the data in this structure for your text-file, you can come up with any structure that represents this hierarchy. An example:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from collections import OrderedDict

url = 'https://www.datacamp.com/courses/experimental-design-in-r'
soup = BeautifulSoup(urlopen(url), 'lxml')

# Maps chapter title -> [(lesson_name, lesson_link), ...].  OrderedDict keeps
# chapters in the same order they appear on the page.
chapters = OrderedDict()
for element in soup.find_all(['h4', 'li']):
    try:
        css_class = element.attrs['class'][0]
        if css_class == 'chapter__title':
            chapter = element.text.strip()
            chapters[chapter] = []
        if css_class == 'chapter__exercise':
            pair = (element.find('h5').text, element.find('a').attrs['href'])
            chapters[chapter].append(pair)
    except KeyError:
        # Elements without a class (or links without href) are skipped.
        pass
From there it should be easy to write your text file:
# Sketch of the Markdown emitter: chapters iterate in insertion order, and
# each chapter carries its (lesson_name, lesson_link) pairs.
for chapter, lessons in chapters.items():
# write chapter title
for lesson_name, lesson_link in lessons:
# write lesson

Can't scrape particular items from some elements

What to do when there is no container or group to select to parse the required items (which are common in each group) looping through it? I'm willing to parse the text, date and author from the pasted elements. The three results I am after do not belong to any particular group or container so I can't find the right way to get them creating a loop.
Here are the elements:
# Sample markup under test: headline paragraphs (.text-large) followed by
# byline paragraphs (.text-sans).  Note one byline (Darcy Ann Olsen and Eric
# Olsen) has no matching headline -- the mismatch the question is about.
html = '''
<div class="view-content">
<p class="text-large experts-more-h">
We Have No Idea if Universal Preschool Actually Helps Kids
</p>
<p class="text-sans">
By David J. Armor. Washington Post. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2014-10-21T09:34:00-04:00">October 21, 2014</span>.
</p>
<p class="text-large experts-more-h">
At Last, Parent Resistance to Collective Standardized Tests
</p>
<p class="text-sans">
By Nat Hentoff. Cato.org. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2014-01-15T09:57:00-05:00">January 15, 2014</span>.
</p>
<p class="text-sans">
By Darcy Ann Olsen and Eric Olsen. Cato.org. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="1999-04-15T00:00:00-04:00">April 15, 1999</span>.
</p>
<p class="text-large experts-more-h">
Day Care: Parents versus Professional Advocates
</p>
<p class="text-sans">
By Darcy Ann Olsen. Cato.org. <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="1998-06-01T00:00:00-04:00">June 1, 1998</span>.
</p>
</div>
'''
If you run my script, you can see that the scraped results is only the first one:
from lxml.html import fromstring
tree = fromstring(html)
# Each selector is only indexed at [0], so exactly one (the first) entry
# of each kind is printed.
post= tree.cssselect(".text-large a")[0].text
date = tree.cssselect(".date-display-single")[0].text
author = tree.cssselect(".text-sans")[0].text.strip()
print(post+'\n', date+'\n', author)
Result:
We Have No Idea if Universal Preschool Actually Helps Kids
October 21, 2014
By David J. Armor. Washington Post.
If you run this one, you will see that this script is able to parse all the results I'm after:
from lxml.html import fromstring

tree = fromstring(html)
# Hoist the three node lists out of the loop instead of re-querying the whole
# tree three times per iteration, and pair them with zip() rather than
# indexing by range(len(...)).  NOTE: this still pairs the lists purely by
# position, so a missing date or author shifts every later row -- the
# container-based XPath approach below is the robust fix.
posts = tree.cssselect(".text-large a")
dates = tree.cssselect(".date-display-single")
authors = tree.cssselect(".text-sans")
for post, date, author in zip(posts, dates, authors):
    print(post.text + '\n', date.text + '\n', author.text.strip())
Results:
We Have No Idea if Universal Preschool Actually Helps Kids
October 21, 2014
By David J. Armor. Washington Post.
At Last, Parent Resistance to Collective Standardized Tests
January 15, 2014
By Nat Hentoff. Cato.org.
Day Care: Parents versus Professional Advocates
April 15, 1999
By Darcy Ann Olsen and Eric Olsen. Cato.org.
However, what i did with my second script is not at all pythonic and it will give wrong results if any data is missing. So, how to select a group or container, loop through it and parse all of them? Thanks in advance.
If one of text nodes (post, date, author) is missing, tree.cssselect(selector)[index].text should return you a NoneType object which you cannot handle as a string. To avoid this you can implement
post= tree.cssselect(".text-large a")[item].text or " "
You can also try below XPath solution:
# Iterate each headline node and pull its date/author from the *following
# sibling* paragraphs, so a missing field cannot shift later rows.
container = tree.cssselect(".text-large")
for item in container:
    # XPath attribute tests use '@class' -- the '#' shown in the original
    # post is a rendering artifact and makes the expression invalid.
    post = item.xpath('./a')[0].text or " "
    date = item.xpath('./following-sibling::p/span[@class="date-display-single"]')[0].text or " "
    author = item.xpath('./following-sibling::p[@class="text-sans"]')[0].text.strip() or " "
    print(post + '\n', date + '\n', author)

Python 2.7.10 Trying to print text from website using Beautiful Soup 4

I want my output to be like:
count:0 - Bournemouth and Watford to go head-to-head for Abdisalam Ibrahim
Olympiacos midfielder Abdisalam Ibrahim is a target for Premier League new-boys Bournemouth and Watford.The former Manchester City man is keen to leave Greece this summer, and his potential availability has alerted Eddie Howe and Quique Sanchez Flores.Lorient of Ligue 1 and La Liga's Rayo Vallacano are also interested in the 24-year-old.
Count:1 - Andre-Pierre Gignac set for Mexico move
Former West Brom target Andre-Pierre Gignac is to complete a move to Mexican side Tigres.The France international is a free agent after leaving Marseille and is set to undergo a medical later today.West Ham, Stoke, Newcastle, West Brom and Dynamo Moscow all showed interest in the 30-year-old although Tony Pulis is understood to have cooled his interest after watching Gignac against Monaco towards the end of last season.
My Program:
from bs4 import BeautifulSoup
import urllib2
response = urllib2.urlopen('http://www.dailymail.co.uk/sport/football/article-3129389/Transfer-News-LIVE-Manchester-United-Arsenal-Liverpool-Real-Madrid-Barcelona-latest-plus-rest-Europe.html')
html = response.read()
soup = BeautifulSoup(html)
count=0
for tag in soup.find_all("div", {"id":"lc-commentary-posts"}):
divTaginb = tag.find_all("div", {"class":"lc-title-container"})
divTaginp = tag.find_all("div",{"class":"lc-post-body"})
for tag1 in divTaginb:
h4Tag = tag1.find_all("b")
for tag2 in h4Tag:
print "count:%d - "%count,
print tag2.text
print '\n'
tagp = divTaginp[count].find_all('p')
for p in tagp:
print p
print '\n'
count +=1
My output:
Count:0 - ....
...
count:37 - ICYMI: Hamburg target Celtic star Stefan Johansen as part of summer
rebuilding process
<p><strong>STEPHEN MCGOWAN:</strong> Bundesliga giants Hamburg have been linked
with a move for CelticΓÇÖs PFA Scotland player of the year Stefan Johansen.</p>
<p>German newspapers claim the Norwegian features on a three-man shortlist of po
tential signings for HSV as part of their summer rebuilding process.</p>
<p>Hamburg scouts are reported to have watched Johansen during Friday nightΓÇÖs
scoreless Euro 2016 qualifier draw with Azerbaijan.</p>
<p><a href="http://www.dailymail.co.uk/sport/football/article-3128854/Hamburg-ta
rget-Celtic-star-Stefan-Johansen-summer-rebuilding-process.html"><strong>CLICK H
ERE for more</strong></a></p>
count:38 - ICYMI: Sevilla agree deal with Chelsea to sign out-of-contract midfi
elder Gael Kakuta
<p>Sevilla have agreed a deal with Premier League champions Chelsea to sign out-
of-contract winger Gael Kakuta.</p>
<p>The French winger, who spent last season on loan in the Primera Division with
Rayo Vallecano, will arrive in Seville on Thursday to undergo a medical with th
e back-to-back Europa League winners.</p>
<p>A statement published on Sevilla's official website confirmed the 23-year-old
's transfer would go through if 'everything goes well' in the Andalusian city.</
p>
<p><strong><a href="http://www.dailymail.co.uk/sport/football/article-3128756/Se
villa-agree-deal-Chelsea-sign-Gael-Kakuta-contract-winger-aims-resurrect-career-
Europa-League-winners.html">CLICK HERE for more</a></strong></p>
count:39 - Good morning everybody!
<p>And welcome to <em>Sportsmail's</em> coverage of all the potential movers and
shakers ahead of the forthcoming summer transfer window.</p>
<p>Whatever deals will be rumoured, agreed or confirmed today you can read all
about them here.</p>
DailyMail Website looks like this:
<div id="lc-commentary-posts"><div id="lc-id-39" class="lc-commentary-post cleared">
<div class="lc-icons">
<img src="http://i.mol.im/i/furniture/live_commentary/football_icons/teams/60x60_bournemouth.png" class="lc-icon">
<img src="http://i.mol.im/i/furniture/live_commentary/football_icons/teams/60x60_watford.png" class="lc-icon">
<div class="lc-post-time">18:03 </div>
</div>
<div class="lc-title-container">
<h4>
<b>Bournemouth and Watford to go head-to-head for Abdisalam Ibrahim</b>
</h4>
</div>
<div class="lc-post-body">
<p><strong>SAMI MOKBEL: </strong>Olympiacos midfielder Abdisalam Ibrahim is a target for Premier League new-boys Bournemouth and Watford.</p>
<p class="mol-para-with-font">The former Manchester City man is keen to leave Greece this summer, and his potential availability has alerted Eddie Howe and Quique Sanchez Flores.</p>
<p class="mol-para-with-font"><font>Lorient of Ligue 1 and La Liga's Rayo Vallacano are also interested in the 24-year-old.</font></p>
</div>
<img class="lc-post-image" src="http://i.dailymail.co.uk/i/pix/2015/06/18/18/1434647000147_lc_galleryImage_TEL_AVIV_ISRAEL_JUNE_11_A.JPG">
<b class="lc-image-caption">Abdisalam Ibrahim could return to England</b>
<div class="lc-clear"></div>
<ul class="lc-social">
<li class="lc-facebook"><span onclick="window.LiveCommentary.socialShare(postToFB, '39', 'facebook')"></span></li>
<li class="lc-twitter"><span onclick="window.LiveCommentary.socialShare(postToTWTTR, '39', 'twitter', window.twitterVia)"></span></li>
</ul>
</div>
<div id="lc-id-38" class="lc-commentary-post cleared">
<div class="lc-icons">
<img src="http://i.mol.im/i/furniture/live_commentary/football_icons/teams/60x60_west_brom.png" class="lc-icon">
<img src="http://i.mol.im/i/furniture/live_commentary/flags/60x60_mexico.png" class="lc-icon">
<div class="lc-post-time">16:54 </div>
</div>
<div class="lc-title-container">
<span><b>Andre-Pierre Gignac set for Mexico move</b></span>
</div>
<div class="lc-post-body">
<p>Former West Brom target Andre-Pierre Gignac is to complete a move to Mexican side Tigres.</p>
<p id="ext-gen225">The France international is a free agent after leaving Marseille and is set to undergo a medical later today.</p>
<p>West Ham, Stoke, Newcastle, West Brom and Dynamo Moscow all showed interest in the 30-year-old although Tony Pulis is understood to have cooled his interest after watching Gignac against Monaco towards the end of last season.</p>
</div>
<img class="lc-post-image" src="http://i.dailymail.co.uk/i/pix/2015/06/18/16/1434642784396_lc_galleryImage__FILES_A_file_picture_tak.JPG">
<b class="lc-image-caption">Andre-Pierre Gignac is to complete a move to Mexican side Tigres</b>
<div class="lc-clear"></div>
<ul class="lc-social">
<li class="lc-facebook"><span onclick="window.LiveCommentary.socialShare(postToFB, '38', 'facebook')"></span></li>
<li class="lc-twitter"><span onclick="window.LiveCommentary.socialShare(postToTWTTR, '38', 'twitter', window.twitterVia)"></span></li>
</ul>
</div>
Now my target is the <b></b> inside <div class="lc-title-container">, which I am getting easily. But when I target all the <p></p> inside <div class="lc-post-body">, I am not able to get only the required text.
I tried p.text and p.strip() but still I am not able to solve my problem.
Error while using p.text
count:19 - City's pursuit of Sterling, Wilshere and Fabian Delph show a need fo
r English quality
MIKE KEEGAN: Colonial explorer Cecil Rhodes is famously reported to have once sa
id that to be an Englishman 'is to have won first prize in the lottery of life'.
Back in the 19th century, the vicar's son was no doubt preaching about the expan
ding Empire and his own experiences in Africa.
Traceback (most recent call last):
File "app.py", line 24, in <module>
print p.text
File "C:\Python27\lib\encodings\cp437.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_map)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u2013' in position
160: character maps to <undefined>
And while i am using p.strip() I am not getting any output.
Is there any good way to do it. Help me get the best way. I am trying this thing from morning and now its night.
I dont want to use any encoder or decoder if possible
dammit = UnicodeDammit(html); print(dammit.unicode_markup)
Here's my code. You should go though it. I was to lazy to add specific fields for the dataset and instead just combined everything.
from bs4 import BeautifulSoup, element
import urllib2
response = urllib2.urlopen('http://www.dailymail.co.uk/sport/football/article-3129389/Transfer-News-LIVE-Manchester-United-Arsenal-Liverpool-Real-Madrid-Barcelona-latest-plus-rest-Europe.html')
html = response.read()
soup = BeautifulSoup(html)
count=0
article_dataset = {}
# Try to make your variables express what your trying to do.
# Collect article posts
article_post_tags = soup.find_all("div", {"id":"lc-commentary-posts"})
# Set up the aricle_dataset with the artilce name as it's key
for article_post_tag in article_post_tags:
container_tags = article_post_tag.find_all("div", {"class":"lc-title-container"})
body_tags = article_post_tag.find_all("div",{"class":"lc-post-body"})
# Find the article name, and initialize an empty dict as the value
for count, container in enumerate(container_tags):
# We know there is only 1 <b> tag in our container,
# so use find() instead of find_all()
article_name_tag = container.find('b')
# Our primary key is the article name, the corrosponding value is the body_tag.
article_dataset[article_name_tag.text] = {'body_tag':body_tags[count]}
for article_name, details in article_dataset.items():
content = []
content_line_tags = details['body_tag'].find_all('p')
# Go through each tag and collect the text
for content_tag in content_line_tags:
for data in content_tag.contents: # gather strings in our tags
if type(data) == element.NavigableString:
data = unicode(data)
else:
data = data.text
content += [data]
# combine the content
content = '\n'.join(content)
# Add the content to our data
article_dataset[article_name]['content'] = content
# remove the body_tag from our aricle data_set
for name, details in article_dataset.items():
del details['body_tag']
print
print
print 'Artilce Name: ' + name
print 'Player: ' + details['content'].split('\n')[0]
print 'Article Summary: ' + details['content']
print

Categories