I am working on a project that requires data from the IMDb business page. I'm using Python. The data is stored between two tags, like this:
Budget
$220,000,000 (estimated)
I want the numeric amount but have not been successful so far. Any suggestions.
Take a look at Beautiful Soup; it's a useful library for scraping. If you take a look at the page source, the "Budget" label is inside an h4 element, and the value is the next node in the DOM. This may not be the best example, but it works for your case:
import urllib
from bs4 import BeautifulSoup
# Fetch the title page. The budget figure is the text node that directly
# follows the <h4> holding the "Budget:" label.
page = urllib.urlopen('http://www.imdb.com/title/tt0118715/?ref_=fn_al_nm_1a')
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid bs4's warning).
soup = BeautifulSoup(page.read(), 'html.parser')
for h4 in soup.find_all('h4'):
    # True when "Budget:" is one of the direct children of this <h4>.
    if "Budget:" in h4:
        print(h4.next_sibling.strip())
        # $15,000,000
This is a whole bunch of code (you can find what you need in it).
The Python script below will give you: 1) the list of top box-office movies from IMDb, and 2) the cast list for each of them.
from lxml.html import parse
def imdb_bo(no_of_movies=5):
    """Scrape the IMDb chart page and the cast list of each listed movie.

    Args:
        no_of_movies: Maximum number of chart entries to return. It is capped
            at ``len(bo_table[0][2])``, i.e. the number of children of the
            chart table's third child (presumably the ``<tbody>`` rows).

    Returns:
        A dict keyed by zero-based chart position. Each value is a dict with
        the string entries 'url', 'title', 'year', 'weekend', 'gross' and
        'weeks', plus 'cast', a list of names read from the movie's own page.

    Raises:
        IndexError: If the chart table, a movie's cast table, or an expected
            cell is missing from the fetched HTML.
    """
    bo_url = 'http://www.imdb.com/chart/'
    bo_page = parse(bo_url).getroot()
    bo_table = bo_page.cssselect('table.chart')
    bo_total = len(bo_table[0][2])
    count = min(no_of_movies, bo_total)

    # Query each column once up front rather than once per movie and field.
    title_cells = bo_page.cssselect('td.titleColumn')
    rating_cells = bo_page.cssselect('td.ratingColumn')
    weeks_cells = bo_page.cssselect('td.weeksColumn')

    movies = {}
    for i in range(count):
        title_cell = title_cells[i]
        mo = {
            'url': 'http://www.imdb.com' + title_cell[0].get('href'),
            'title': title_cell[0].text_content().strip(),
            'year': title_cell[1].text_content().strip(" ()"),
            # Each movie owns two consecutive rating cells: weekend, then gross.
            'weekend': rating_cells[i * 2].text_content().strip(),
            'gross': rating_cells[i * 2 + 1][0].text_content().strip(),
            'weeks': weeks_cells[i].text_content().strip(),
        }

        # One extra request per movie to read its cast table.
        m_page = parse(mo['url']).getroot()
        m_casttable = m_page.cssselect('table.cast_list')
        # The first row of the cast table is skipped (presumably a header).
        cast_rows = list(m_casttable[0])[1:]
        mo['cast'] = [row[1][0][0].text_content().strip() for row in cast_rows]

        movies[i] = mo
    return movies
if __name__ == '__main__':
    # Ask how many chart entries to show, then print one summary per movie.
    # print() is always called with a single argument, so the output is the
    # same as with the Python 2 print statement.
    no_of_movies = raw_input("Enter no. of Box office movies to display:")
    bo_movies = imdb_bo(int(no_of_movies))
    for rank, movie in bo_movies.iteritems():
        # Keys are zero-based chart positions; display them one-based.
        print('#%d %s (%s)' % (rank + 1, movie['title'], movie['year']))
        print('URL: %s' % (movie['url'],))
        print('Weekend: %s' % (movie['weekend'],))
        print('Gross: %s' % (movie['gross'],))
        print('Weeks: %s' % (movie['weeks'],))
        print('Cast: %s' % (', '.join(movie['cast']),))
        print('\n')
Output (run in terminal):
parag#parag-innovate:~/python$ python imdb_bo_scraper.py
Enter no. of Box office movies to display:3
#1 Cinderella (2015)
URL: http://www.imdb.com/title/tt1661199?ref_=cht_bo_1
Weekend: $67.88M
Gross: $67.88M
Weeks: 1
Cast: Cate Blanchett, Lily James, Richard Madden, Helena Bonham Carter, Nonso Anozie, Stellan Skarsgård, Sophie McShera, Holliday Grainger, Derek Jacobi, Ben Chaplin, Hayley Atwell, Rob Brydon, Jana Perez, Alex Macqueen, Tom Edden
#2 Run All Night (2015)
URL: http://www.imdb.com/title/tt2199571?ref_=cht_bo_2
Weekend: $11.01M
Gross: $11.01M
Weeks: 1
Cast: Liam Neeson, Ed Harris, Joel Kinnaman, Boyd Holbrook, Bruce McGill, Genesis Rodriguez, Vincent D'Onofrio, Lois Smith, Common, Beau Knapp, Patricia Kalember, Daniel Stewart Sherman, James Martinez, Radivoje Bukvic, Tony Naumovski
#3 Kingsman: The Secret Service (2014)
URL: http://www.imdb.com/title/tt2802144?ref_=cht_bo_3
Weekend: $6.21M
Gross: $107.39M
Weeks: 5
Cast: Adrian Quinton, Colin Firth, Mark Strong, Jonno Davies, Jack Davenport, Alex Nikolov, Samantha Womack, Mark Hamill, Velibor Topic, Sofia Boutella, Samuel L. Jackson, Michael Caine, Taron Egerton, Geoff Bell, Jordan Long
Well you asked for python and you asked for a scraping solution.
But there is no need for python and no need to scrape anything because the budget figures are available in the business.list text file available at http://www.imdb.com/interfaces
Try IMDbPY and its documentation. To install, just pip install imdbpy
from imdb import IMDb
# Create the IMDbPY access object.
ia = IMDb()
# search_movie() returns a sequence of matches; keep the first one.
results = ia.search_movie('The Untouchables')
movie = results[0]
# Update the movie object in place (presumably fetching its full record).
ia.update(movie)
# Lots of info for the movie from IMDB
movie.keys()
Though I'm not sure where to find specifically budget info
Related
I'm trying to scrape data from Elle.com under a search term. I noticed when I click the button, it sends a request that updates the &page=2 in the url. However, the following code just gets me a lot of duplicate entries. I need help finding a way to set a start point for each iteration of the loop (I think). Any ideas?
import requests,nltk,pandas as pd
from bs4 import BeautifulSoup as bs
def get_hits(url):
    """Collect the sentences of an article that contain 'kim' or 'kanye'.

    The page at ``url`` is downloaded and every ``<p class="body-text">`` is
    split into sentences with nltk. The match is a case-sensitive substring
    test, and a sentence containing both keywords is reported once per
    keyword.

    Args:
        url: Address of the article page to download.

    Returns:
        A list of non-empty sentence lists (one per paragraph and keyword
        that matched), or ``["no hits"]`` when nothing matched.
    """
    response = requests.get(url)
    page = bs(response.content, 'html')
    hits = []
    for paragraph in page.find_all('p', {'class': 'body-text'}):
        sentences = nltk.sent_tokenize(paragraph.text)
        for keyword in ('kim', 'kanye'):
            matching = [s for s in sentences if keyword in s]
            # Empty match lists are never kept.
            if matching:
                hits.append(matching)
    if not hits:
        hits.append("no hits")
    return hits
# NOTE(review): indentation was lost in this listing; the loop bodies and the
# exact nesting of the counter check below must be restored before it runs.
# Parallel lists: one entry per search result (title, keyword hits, URL).
titles =[]
key_hits = []
urls = []
# Running count used for the cut-off at 100 below.
counter = 1
# Request result pages 1-9 of the site search for "kanye".
for i in range(1,10):
url = f'https://www.elle.com/search/?page={i}&q=kanye'
r = requests.get(url)
soup = bs(r.content, 'html')
# Presumably one div with these classes per search result.
groups = soup.find_all('div',{'class':'simple-item grid-simple-item'})
for j in range(len(groups)):
# The first <a> inside the result holds a site-relative href.
urls.append('https://www.elle.com'+ groups[j].find('a')['href'])
titles.append(groups[j].find('div',{'class':'simple-item-title item-title'}).text)
# Downloads the article itself to search its body text.
key_hits.append(get_hits('https://www.elle.com'+ groups[j].find('a')['href']))
# break only leaves the loop it is nested in.
if (counter == 100):
break
counter+=1
# One row per collected result.
data = pd.DataFrame({
'Title':titles,
'Body':key_hits,
'Links':urls
})
data.head()
Let me know if there's something I don't understand that I probably should. Just a marketing researcher trying to learn powerful tools here.
To get pagination working on the site, you can use their infinite-scroll API URL (this example will print 9*42 titles):
import requests
from bs4 import BeautifulSoup
api_url = "https://www.elle.com/ajax/infiniteload/"

# Query-string fields sent to the endpoint; "page" is overwritten before
# every request.
params = {
    "id": "search",
    "class": "CoreModels\\search\\TagQueryModel",
    "viewset": "search",
    "trackingId": "search-results",
    "trackingLabel": "kanye",
    "params": '{"input":"kanye","page_size":"42"}',
    "page": "1",
    "cachebuster": "undefined",
}

all_titles = set()

# Pages 1-9: print every title as it is seen and remember it in the set.
for page_number in range(1, 10):
    params.update(page=page_number)
    response = requests.get(api_url, params=params)
    soup = BeautifulSoup(response.content, "html.parser")
    for title_node in soup.select(".item-title"):
        title_text = title_node.text
        print(title_text)
        all_titles.add(title_text)

print()
print("Unique titles:", len(all_titles))  # <-- 9 * 42 = 378
Prints:
...
Kim Kardashian and Kanye West Respond to Those Divorce Rumors
People Are Noticing Something Fishy About Taylor Swift's Response to Kim Kardashian
Kim Kardashian Just Went on an Intense Twitter Rant Defending Kanye West
Trump Is Finally Able to Secure a Meeting With a Kim
Kim Kardashian West is Modeling Yeezy on the Street Again
Aziz Ansari's Willing to Model Kanye's Clothes
Unique titles: 378
Actually, the "load more" pagination is driven by API calls that return plain HTML. Each link in the response is a relative URL, so I convert it into an absolute URL with the urljoin method, and I build the pagination into api_urls.
Code:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# One infinite-scroll API URL per result page (pages 1-3); only "page" varies.
# The "&params=" field carries the URL-encoded JSON with the search input and
# page size.
api_urls = [
    "https://www.elle.com/ajax/infiniteload/?id=search"
    "&class=CoreModels%5Csearch%5CTagQueryModel"
    "&viewset=search&trackingId=search-results&trackingLabel=kanye"
    "&params=%7B%22input%22%3A%22kanye%22%2C%22page_size%22%3A%2242%22%7D"
    "&page=" + str(x) + "&cachebuster=undefined"
    for x in range(1, 4)
]

# The hrefs in the response are relative to the site being scraped, so they
# must be resolved against elle.com. A store.steampowered.com base yields
# links to pages that do not exist on that host.
Base_url = "https://www.elle.com"

for url in api_urls:
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "lxml")
    cards = soup.select("div.simple-item.grid-simple-item")
    for card in cards:
        title = card.select_one("div.simple-item-title.item-title")
        href = card.select_one("a")["href"]
        abs_link = urljoin(Base_url, href)
        print("Title:" + title.text + " Links: " + abs_link)
    # Separator after each page of results.
    print("-" * 80)
Output:
Title:Inside Kim Kardashian and Kanye West’s Current Relationship Amid Dinner Sighting Links: https://store.steampowered.com/culture/celebrities/a37833256/kim-kardashian-kanye-west-reconciled/
Title:Kim Kardashian And Ex Kanye West Left For SNL Together Amid Reports of Reconciliation Efforts Links: https://store.steampowered.com/culture/celebrities/a37919434/kim-kardashian-kanye-west-leave-for-snl-together-reconciliation/
Title:Kim Kardashian Wore a Purple Catsuit for Dinner With Kanye West Amid Reports She's Open to Reconciling Links: https://store.steampowered.com/culture/celebrities/a37822625/kim-kardashian-kanye-west-nobu-dinner-september-2021/
Title:How Kim Kardashian Really Feels About Kanye West Saying He ‘Wants Her Back’ Now Links: https://store.steampowered.com/culture/celebrities/a37463258/kim-kardashian-kanye-west-reconciliation-feelings-september-2021/
Title:Why Irina Shayk and Kanye West Called Off Their Two-Month Romance Links: https://store.steampowered.com/culture/celebrities/a37366860/why-irina-shayk-kanye-west-broke-up-august-2021/
Title:Kim Kardashian and Kanye West Reportedly Are ‘Working on Rebuilding’ Relationship and May Call Off Divorce Links: https://store.steampowered.com/culture/celebrities/a37421190/kim-kardashian-kanye-west-repairing-relationship-divorce-august-2021/
Title:What Kim Kardashian and Kanye West's ‘Donda’ Wedding Moment Really Means for Their Relationship Links: https://store.steampowered.com/culture/celebrities/a37415557/kim-kardashian-kanye-west-donda-wedding-moment-explained/
Title:What Kim Kardashian and Kanye West's Relationship Is Like Now: ‘The Tension Has Subsided’ Links: https://store.steampowered.com/culture/celebrities/a37383301/kim-kardashian-kanye-west-relationship-details-august-2021/
Title:How Kim Kardashian and Kanye West’s Relationship as Co-Parents Has Evolved Links: https://store.steampowered.com/culture/celebrities/a37250155/kim-kardashian-kanye-west-co-parents/
Title:Kim Kardashian Went Out in a Giant Shaggy Coat and a Black Wrap Top for Dinner in NYC Links: https://store.steampowered.com/culture/celebrities/a37882897/kim-kardashian-shaggy-coat-black-outfit-nyc-dinner/
Title:Kim Kardashian Wore Two Insane, Winter-Ready Outfits in One Warm NYC Day Links: https://store.steampowered.com/culture/celebrities/a37906750/kim-kardashian-overdressed-fall-outfits-october-2021/
Title:Kim Kardashian Dressed Like a Superhero for Justin Bieber's 2021 Met Gala After Party Links: https://store.steampowered.com/culture/celebrities/a37593656/kim-kardashian-superhero-outfit-met-gala-after-party-2021/
Title:Kim Kardashian Killed It In Her Debut as a Saturday Night Live Host Links: https://store.steampowered.com/culture/celebrities/a37918950/kim-kardashian-saturday-night-live-best-sketches/
Title:Kim Kardashian Has Been Working ‘20 Hours a Day’ For Her Appearance On SNL Links: https://store.steampowered.com/culture/celebrities/a37915962/kim-kardashian-saturday-night-live-preperation/
Title:Why Taylor Swift and Joe Alwyn Skipped the 2021 Met Gala Links: https://store.steampowered.com/culture/celebrities/a37446411/why-taylor-swift-joe-alwyn-skipped-met-gala-2021/
Title:Kim Kardashian Says North West Still Wants to Be an Only Child Five Years Into Having Siblings Links: https://store.steampowered.com/culture/celebrities/a37620539/kim-kardashian-north-west-only-child-comment-september-2021/
Title:How Kim Kardashian's Incognito 2021 Met Gala Glam Came Together Links: https://store.steampowered.com/beauty/makeup-skin-care/a37584576/kim-kardashians-incognito-2021-met-gala-beauty-breakdown/
Title:Kim Kardashian Completely Covered Her Face and Everything in a Black Balenciaga Look at the 2021 Met Gala Links: https://store.steampowered.com/culture/celebrities/a37578520/kim-kardashian-faceless-outfit-met-gala-2021/
Title:How Kim Kardashian Feels About Kanye West Singing About Their Divorce and ‘Losing My Family’ on Donda Album Links: https://store.steampowered.com/culture/celebrities/a37113130/kim-kardashian-kanye-west-divorce-song-donda-album-feelings/
Title:Kanye West Teases New Song In Beats By Dre Commercial Starring Sha'Carri Richardson Links: https://store.steampowered.com/culture/celebrities/a37090223/kanye-west-teases-new-song-in-beats-by-dre-commercial-starring-shacarri-richardson/
Title:Inside Kim Kardashian and Kanye West's Relationship Amid His Irina Shayk Romance Links: https://store.steampowered.com/culture/celebrities/a37077662/kim-kardashian-kanye-west-relationship-irina-shayk-romance-july-2021/
and ... so on
I am looking to do a data science project in which I sum up fantasy football points by the college the players went to (e.g. Alabama has 56 active players in the NFL, so I would go through a database and add up all of their fantasy points to compare with other schools).
I was looking at the website:
https://fantasydata.com/nfl/fantasy-football-leaders?season=2020&seasontype=1&scope=1&subscope=1&aggregatescope=1&range=3
and I was going to use Beautiful Soup to scrape the rows of players and statistics and ultimately, fantasy football points.
However, I am having trouble figuring out how to extract the players' college alma mater. To do so, I would have to:
Click each "players" name
Scrape each and every profile of the hundreds of NFL players for one line "College"
Place all of this information into its own column.
Any suggestions here?
There's no need for Selenium, or other headless, automated browsers. That's overkill.
If you take a look at your browser's network traffic, you'll notice that your browser makes a POST request to this REST API endpoint: https://fantasydata.com/NFL_FantasyStats/FantasyStats_Read
If the POST request is well-formed, the API responds with JSON, containing information about every single player. Normally, this information would be used to populate the DOM asynchronously using JavaScript. There's quite a lot of information there, but unfortunately, the college information isn't part of the JSON response. However, there is a field PlayerUrlString, which is a relative-URL to a given player's profile page, which does contain the college name. So:
Make a POST request to the API to get information about all players
For each player in the response JSON:
Visit that player's profile
Use BeautifulSoup to extract the college name from the current
player's profile
Code:
def main():
    """Print the college of each player listed by the fantasy stats endpoint.

    A POST to the stats endpoint returns JSON whose "Data" entries each carry
    a "Name" and a site-relative "PlayerUrlString". Every profile page is then
    downloaded and the college is read from its details list.

    Returns:
        0, which the entry point passes to ``sys.exit``.

    Raises:
        requests.HTTPError: If any response has an error status.
    """
    import requests
    from bs4 import BeautifulSoup

    url = "https://fantasydata.com/NFL_FantasyStats/FantasyStats_Read"

    # Form fields of the POST request; "pageSize" limits how many players
    # come back.
    data = {
        "sort": "FantasyPoints-desc",
        "pageSize": "50",
        "filters.season": "2020",
        "filters.seasontype": "1",
        "filters.scope": "1",
        "filters.subscope": "1",
        "filters.aggregatescope": "1",
        "filters.range": "3",
    }

    listing = requests.post(url, data=data)
    listing.raise_for_status()

    for player in listing.json()["Data"]:
        profile = requests.get("https://fantasydata.com" + player["PlayerUrlString"])
        profile.raise_for_status()

        soup = BeautifulSoup(profile.content, "html.parser")
        details = soup.find("dl", {"class": "dl-horizontal"})
        # The college is taken from the last <dd> of the details list.
        college = details.findAll("dd")[-1].text.strip()

        print(player["Name"] + " went to " + college)

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())
Output:
Patrick Mahomes went to Texas Tech
Kyler Murray went to Oklahoma
Aaron Rodgers went to California
Russell Wilson went to Wisconsin
Josh Allen went to Wyoming
Deshaun Watson went to Clemson
Ryan Tannehill went to Texas A&M
Lamar Jackson went to Louisville
Dalvin Cook went to Florida State
...
You can also edit the pageSize POST parameter in the data dictionary. The 50 corresponds to information about the first 50 players in the JSON response (according to the filters set by the other POST parameters). Changing this value will yield more or fewer players in the JSON response.
I agree, APIs are the way to go if they are there. My second "go to" is pandas' .read_html() (which uses BeautifulSoup under the hood to parse <table> tags). Here's an alternate solution using ESPN's API to get team roster links, then using pandas to pull the table from each link. It saves you the trouble of having to iterate through each player to get the college (I wish they just had an API that returned all players; nfl.com USED to have that, but it is no longer publicly available, as far as I know).
Code:
import requests
import pandas as pd
all_teams = []
roster_links = []

# Team ids 1-34 are queried. Each team record lists several links; only the
# one labelled "Roster" is kept.
for team_id in range(1, 35):
    url = 'http://site.api.espn.com/apis/site/v2/sports/football/nfl/teams/{teamId}'.format(teamId=team_id)
    jsonData = requests.get(url).json()
    print(jsonData['team']['displayName'])
    for link in jsonData['team']['links']:
        if link['text'] == 'Roster':
            roster_links.append(link['href'])
            break

# Matches the leading run of letters, dots, apostrophes and spaces.
# Presumably the scraped "Name" cell is the player name with the jersey
# number appended, so the match is the name and the remainder is the number.
name_pattern = r"([A-Za-z.' ]+)"

for link in roster_links:
    print(link)
    tables = pd.read_html(link)
    df = pd.concat(tables).drop('Unnamed: 0', axis=1)
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave 'Name' intact.
    df['Jersey'] = df['Name'].str.replace(name_pattern, '', regex=True)
    df['Name'] = df['Name'].str.extract(name_pattern)
    all_teams.append(df)

final_df = pd.concat(all_teams).reset_index(drop=True)
Output:
print (final_df)
Name POS Age HT WT Exp College Jersey
0 Matt Ryan QB 35 6' 4" 217 lbs 13 Boston College 2
1 Matt Schaub QB 39 6' 6" 245 lbs 17 Virginia 8
2 Todd Gurley II RB 26 6' 1" 224 lbs 6 Georgia 21
3 Brian Hill RB 25 6' 1" 219 lbs 4 Wyoming 23
4 Qadree Ollison RB 24 6' 1" 232 lbs 2 Pittsburgh 30
... .. ... ... ... .. ... ...
1772 Jonathan Owens S 25 5' 11" 210 lbs 2 Missouri Western 36
1773 Justin Reid S 23 6' 1" 203 lbs 3 Stanford 20
1774 Ka'imi Fairbairn PK 26 6' 0" 183 lbs 5 UCLA 7
1775 Bryan Anger P 32 6' 3" 205 lbs 9 California 9
1776 Jon Weeks LS 34 5' 10" 242 lbs 11 Baylor 46
[1777 rows x 8 columns]
Hi I am trying to scrape this website with Python 3 and noticed that in the source code it does not give a clear indication of how I would scrape the names of the winners in these primary elections. Can you show me how to scrape a list of all the winners in every MD primary election with this website?
https://elections2018.news.baltimoresun.com/results/
The parsing is a little bit complicated, because the results are spread over many subpages. This script collects them and prints the result (all data is stored in the variable data):
from bs4 import BeautifulSoup
import requests
url = "https://elections2018.news.baltimoresun.com/results/"
index_page = requests.get(url)
# Maps (race title, party header) -> list of (candidate, votes, percent).
data = {}
soup = BeautifulSoup(index_page.text, 'lxml')

# Every element whose id starts with "race" points at a contest subpage; the
# part of the id after the first "-" is the subpage's file name.
for race in soup.select('div[id^=race]'):
    contest_id = race['id'].split('-')[1]
    contest_page = requests.get(f"https://elections2018.news.baltimoresun.com/results/contests/{contest_id}.html")
    s = BeautifulSoup(contest_page.text, 'lxml')
    key = (s.find('h3').text, s.find('div', {'class': 'party-header'}).text)
    rows = zip(s.select('td.candidate'), s.select('td.votes'), s.select('td.percent'))
    data[key] = [(c.text, v.text, p.text) for c, v, p in rows]

print('Winners:')
# The first row of each contest is printed as its winner (presumably the
# subpage lists candidates by descending vote count).
for (race, party), v in data.items():
    print(race, party, v[0])

# print(data)
Outputs:
Winners:
Governor / Lt. Governor Democrat ('Ben Jealous and Susan Turnbull', '227,764', '39.6%')
U.S. Senator Republican ('Tony Campbell', '50,915', '29.2%')
U.S. Senator Democrat ('Ben Cardin', '468,909', '80.4%')
State's Attorney Democrat ('Marilyn J. Mosby', '39,519', '49.4%')
County Executive Democrat ('John "Johnny O" Olszewski, Jr.', '27,270', '32.9%')
County Executive Republican ('Al Redmer, Jr.', '17,772', '55.7%')
I'm trying to get both the text and the links as shown in the picture. But I can only get the text via siblings and the links after. I need them to come together like in the image. I tried using br.next_element but it doesn't grab the a-links. What am I missing?
import requests
from bs4 import BeautifulSoup
url_id = 'aM7aW0G58CI'
session = requests.Session()
response = session.get('https://www.youtube.com/watch?v=' + url_id)
soup = BeautifulSoup(response.text, 'lxml')

# First pass: print whatever node directly follows each <br> of the
# description paragraph.
for description in soup.find_all('p', id='eow-description'):
    for line_break in description.find_all('br'):
        print(line_break.next_sibling)

# Second pass: print the text of every link in the description. Because the
# two passes are separate, texts and links are not printed side by side.
for description in soup.find_all('p', id='eow-description'):
    for anchor in description.find_all('a'):
        print(anchor.text)
This is the output that I am getting. I'm not getting what the screenshot below shows.
OutPut:
Special shout to
Wanna support what we do? Livestream at 2PM PT!:
It Wasn’t Me, I Swear!:
TheDeFrancoFam Vlog:
————————————
CATCH UP ON THIS WEEK’S SHOWS:
<br/>
Why People Are Freaking Out About The Trump NFL Boycott and Anthony Weiner Going to Jail…:
WOW! Dirty Advertising Exposed And Major Backlash Following Unexpected Compromise…:
Why Trump's "HUGE Failure" Is A Massive Loss For His Enemies and A Shocking Change To Women's Rights:
DISGUSTING! The Horrible Truth About Belle Gibson Exposed, Controversial Video Blows Up, and More:
<br/>
————————————
GET SOME GEAR:
————————————
FACEBOOK:
TWITTER:
INSTAGRAM:
SNAPCHAT: TheDeFrancoFam
REDDIT:
ITUNES:
GOOGLE PLAY:
————————————
Edited by:
James Girardier -
Jason Mayer -
<br/>
Produced by:
Amanda Morones -
<br/>
Motion Graphics Artist:
Brian Borst -
<br/>
P.O. BOX
Attn: Philip DeFranco
16350 Ventura Blvd
Ste D #542
Encino, CA 91436
http://DKPhil.com
http://DeFrancoElite.com
https://youtu.be/fFxDbYE06zU
https://youtu.be/kR7DquGe4vY
https://youtu.be/qdWUQGHtyPk
https://youtu.be/CWlUs1-7KN4
https://youtu.be/kUWt-oipvOY
https://youtu.be/XVsTh4zxKNo
https://teespring.com/stores/defranco...
http://on.fb.me/mqpRW7
http://Twitter.com/PhillyD
https://instagram.com/phillydefranco/
https://www.reddit.com/r/DeFranco
http://DeFrancoMistakes.com
http://mistakeswithdefranco.com
https://twitter.com/jamesgirardier
https://www.instagram.com/jayjaymay/
https://twitter.com/MandaOhDang
https://twitter.com/brianjborst
Using children and checking tag name (child.name) I made
import requests
from bs4 import BeautifulSoup
url_id = 'aM7aW0G58CI'
s = requests.Session()
r = s.get('https://www.youtube.com/watch?v='+url_id)
soup = BeautifulSoup(r.text, 'lxml')

# Text of the nodes that follow <br> tags, collected since the last link; it
# is printed as the label in front of the next <a>.
br = ''
for p in soup.find_all('p', id='eow-description'):
    for child in p.children:
        if child.name == 'a':
            print(br, child.text)
            br = ''  # reset br
        elif child.name == 'br':
            following = child.next_sibling
            # A <br> that is the last child has no next sibling (None), so
            # check for that before reading .name. Consecutive <br/> tags
            # are skipped as before.
            if following is not None and following.name != 'br':
                br += str(following)
        # else:
        #     print(child.name, child)
I get:
Special shout to http://DKPhil.com
Wanna support what we do? Livestream at 2PM PT!: http://DeFrancoElite.com
It Wasn’t Me, I Swear!: https://youtu.be/fFxDbYE06zU
TheDeFrancoFam Vlog: https://youtu.be/kR7DquGe4vY
———————————— CATCH UP ON THIS WEEK’S SHOWS: Why People Are Freaking Out About The Trump NFL Boycott and Anthony Weiner Going to Jail…: https://youtu.be/qdWUQGHtyPk
WOW! Dirty Advertising Exposed And Major Backlash Following Unexpected Compromise…: https://youtu.be/CWlUs1-7KN4
Why Trump's "HUGE Failure" Is A Massive Loss For His Enemies and A Shocking Change To Women's Rights: https://youtu.be/kUWt-oipvOY
DISGUSTING! The Horrible Truth About Belle Gibson Exposed, Controversial Video Blows Up, and More: https://youtu.be/XVsTh4zxKNo
————————————GET SOME GEAR: https://teespring.com/stores/defranco...
————————————FACEBOOK: http://on.fb.me/mqpRW7
TWITTER: http://Twitter.com/PhillyD
INSTAGRAM: https://instagram.com/phillydefranco/
SNAPCHAT: TheDeFrancoFamREDDIT: https://www.reddit.com/r/DeFranco
ITUNES: http://DeFrancoMistakes.com
GOOGLE PLAY: http://mistakeswithdefranco.com
————————————Edited by:James Girardier - https://twitter.com/jamesgirardier
Jason Mayer - https://www.instagram.com/jayjaymay/
Produced by:Amanda Morones - https://twitter.com/MandaOhDang
Motion Graphics Artist:Brian Borst - https://twitter.com/brianjborst
EDIT: you may have to use
else:
print(child.name, child)
to get PO BOX address
I found a really simple way:
# get_text('\n') joins the text pieces of the paragraph with newlines.
for paragraph in soup.find_all('p', id='eow-description'):
    text = paragraph.get_text('\n')
    print(text)
Only issue now is that some of the links are stripped with ...
You can also play around with youtube-dl python module to get the description of a youtube video that way as well.
I have found this way..
import pafy
# Look the video up by its watch URL and print the description pafy reports.
video_url = 'https://www.youtube.com/watch?v=aM7aW0G58CI'
video = pafy.new(video_url)
description = video.description
print(description)
By this method, you will get your content in the exact same way as shown in Youtube's video description.
Say if I was given a web page, e.g this, how could I copy the text starting from <root response="True"> and ending at </root>
How could I do this in Python?
import xml.etree.ElementTree as et
import requests
URL = "http://www.omdbapi.com/?t=True%20Grit&r=XML"
def main():
    """Download the XML document at URL and print the first child's attributes.

    Each attribute of the root's first child element is printed on its own
    line as ``name: value``, with the name right-aligned to 10 characters.
    """
    response = requests.get(URL)
    root = et.fromstring(response.content)
    first_child = root[0]
    for attr, value in first_child.items():
        print("{:>10}: {}".format(attr, value))


if __name__ == "__main__":
    main()
results in
poster: http://ia.media-imdb.com/images/M/MV5BMjIxNjAzODQ0N15BMl5BanBnXkFtZTcwODY2MjMyNA##._V1_SX300.jpg
metascore: 80
director: Ethan Coen, Joel Coen
released: 22 Dec 2010
awards: Nominated for 10 Oscars. Another 30 wins & 85 nominations.
year: 2010
genre: Adventure, Drama, Western
imdbVotes: 184,711
plot: A tough U.S. Marshal helps a stubborn young woman track down her father's murderer.
rated: PG-13
language: English
title: True Grit
country: USA
writer: Joel Coen (screenplay), Ethan Coen (screenplay), Charles Portis (novel)
actors: Jeff Bridges, Hailee Steinfeld, Matt Damon, Josh Brolin
imdbID: tt1403865
runtime: 110 min
type: movie
imdbRating: 7.7
I would use requests and BeautifulSoup for this:
>>> import requests
>>> from bs4 import BeautifulSoup
>>> r = requests.get('http://www.omdbapi.com/?t=True%20Grit&r=XML')
>>> soup = BeautifulSoup(r.text)
>>> list(soup('root')[0].children)
[<movie actors="Jeff Bridges, Hailee Steinfeld, Matt Damon, Josh Brolin" awards="Nominated for 10 Oscars. Another 30 wins & 85 nominations." country="USA" director="Ethan Coen, Joel Coen" genre="Adventure, Drama, Western" imdbid="tt1403865" imdbrating="7.7" imdbvotes="184,711" language="English" metascore="80" plot="A tough U.S. Marshal helps a stubborn young woman track down her father's murderer." poster="http://ia.media-imdb.com/images/M/MV5BMjIxNjAzODQ0N15BMl5BanBnXkFtZTcwODY2MjMyNA##._V1_SX300.jpg" rated="PG-13" released="22 Dec 2010" runtime="110 min" title="True Grit" type="movie" writer="Joel Coen (screenplay), Ethan Coen (screenplay), Charles Portis (novel)" year="2010"></movie>]
Download the document with urllib2: http://docs.python.org/2/howto/urllib2.html
A good parser for short, simple, well formed XML like this is Minidom. Here is how to parse:
http://docs.python.org/2/library/xml.dom.minidom.html
Then get the text, e.g.: Getting text between xml tags with minidom