What is wrong with my web scraper code? (Python 3.4)

I am trying to scrape a table from a website. It runs, but I am not getting any output to my file. Where am I going wrong?
Code:
from bs4 import BeautifulSoup
import urllib.request
f = open('nbapro.txt','w')
errorFile = open('nbaerror.txt','w')
page = urllib.request.urlopen('http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections')
content = page.read()
soup = BeautifulSoup(content)
tableStats = soup.find('table', {'class': 'data-table xsmall'})
for row in tableStats.findAll('tr')[2:]:
    col = row.findAll('td')
    try:
        name = col[0].a.string.strip()
        f.write(name + '\n')
    except Exception as e:
        errorFile.write(str(e) + '******' + str(col) + '\n')
f.close()
errorFile.close()

The problem is that the table data you are trying to scrape is filled in by JavaScript running on the browser side. urllib is not a browser and therefore cannot execute JavaScript.
If you want to solve it via urllib and BeautifulSoup, you have to extract the JSON object from the script tag and load it via json.loads(). Here is an example that prints player names:
import json
import re
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup(urllib.request.urlopen('http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections'))
script = soup.find('script', text=lambda x: x and 'NF_DATA' in x).text
data = re.search(r'NF_DATA = (.*?);', script).group(1)
data = json.loads(data)
for player_id, player in data['players'].items():
    print(player['name'] + ' ' + player['last_name'])
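Alternatively, if extracting the embedded JSON feels brittle, a real browser can run the JavaScript for you. A minimal sketch using Selenium (my addition, not part of the original answer; it assumes Selenium and a matching ChromeDriver are installed):
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections')
# page_source holds the DOM after the JavaScript has run
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
# the class name is taken from the question; it may differ once rendered
tableStats = soup.find('table', {'class': 'data-table xsmall'})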

Related

Using multiple for loop with Python Using Beautiful Soup

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what I have to get all of the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
            p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests
resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())
print(texts)
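As a usage note (my addition, not part of the original answer), the same loop collapses to a list comprehension, and get_text(strip=True) trims the surrounding whitespace:
selector = (
    ".p24_regularListing .p24_propertyOverview "
    ".p24_propertyOverviewRow .p24_propertyOverviewKey"
)
texts = [tag.get_text(strip=True) for tag in soup.select(selector)]
print(texts)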

Saving and scraping multiple pages with BeautifulSoup and pandas

I tested my code in a Jupyter notebook with this code:
...
rname = soup.find('p', 'con_tx')
#rnamelis = rname.findAll('p')
rname
from urllib.request import urljoin
story=[]
#review_text = lis[0].find('p').getText()
#list_soup =soup.find_all('p', 'con_tx')
story=rname.getText()
story
and it worked well.
(result) '전 여친에 ...'
But when I tried to scrape multiple pages
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import urljoin
import pandas as pd
import numpy as np
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages =['177374','164102']
url = base_url + pages[0]
story = []
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('p', 'con_tx'))
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = {story}
    df = pd.DataFrame(data)
df.head()
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
An error message came out.
ValueError: DataFrame constructor not properly called!
How do I fix my code?
[Screenshot: crawling area]
Not sure what you are trying to do, but one thing I notice is that you are overwriting your DataFrame on every pass through the loop. I also don't know why you initialise story as a list and then wrap it in a set inside the loop ({story} is a set literal, which is what trips up the DataFrame constructor).
from bs4 import BeautifulSoup
import pandas as pd
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages =['177374','164102']
df = pd.DataFrame()
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = [story]
    df = df.append(pd.DataFrame(data), sort=True).reset_index(drop=True)
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
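A sketch of an alternative (my addition): DataFrame.append was removed in pandas 2.0, and growing a DataFrame inside a loop is slow anyway, so this collects the stories in a plain list first (same base_url and pages as above):
stories = []
for n in pages:
    res = requests.get(base_url + n)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    # assumes every page has a <p class="con_tx"> synopsis, as in the question
    stories.append(soup.find('p', 'con_tx').getText())
df = pd.DataFrame({'story': stories})
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')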

Movie review crawling

I want to crawl all the movie reviews on this page (the part marked with a red circle in my screenshot).
I tried to crawl it with this code. (I used Jupyter Notebook, Anaconda3.)
import requests
from bs4 import BeautifulSoup
test_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=174903&type=after&page=1"
resp = requests.get(test_url)
soup = BeautifulSoup(resp.content, 'html.parser')
soup
score_result = soup.find('div', {'class': 'score_result'})
lis = score_result.findAll('li')
lis[:3]
from urllib.request import urljoin #When I ran this block and next block it didn't save any reviews.
review_text=[]
#review_text = lis[0].find('p').getText()
list_soup = soup.find_all('li', 'p')
for item in list_soup:
    review_text.append(item.find('p').get_text())
review_text[:5] #Nothing was saved.
As I wrote in the third and fourth blocks, nothing was saved. What is the problem?
This will get what you want. Tested in Python within Jupyter Notebook (latest):
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
test_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=174903&type=after&page=1"
resp = requests.get(test_url)
soup = BeautifulSoup(resp.content, 'html.parser')
movie_lst = soup.select_one('div.score_result')
ul_movie_lst = movie_lst.ul
for movie in ul_movie_lst:
    if isinstance(movie, NavigableString):
        continue
    score = movie.select_one('div.star_score em').text
    name = movie.select_one('div.score_reple p span').text
    review = movie.select_one('div.score_reple dl dt em a span').text
    print(score + "\t" + name)
    print("\t" + review)

Getting rid of HTML tags in Python when scraping

So I'm trying to scrape the box score for an NBA game from ESPN. I tried to get the names first, but I'm having a difficult time getting rid of the HTML tags.
I've tried using
get_text(), .text(), .string_strip()
but they keep giving me errors.
Here's the code I'm working with right now.
from bs4 import BeautifulSoup
import requests
url= "http://scores.espn.com/nba/boxscore?gameId=400900407"
r = requests.get(url)
soup = BeautifulSoup(r.text,"html.parser")
name = []
for row in soup.find_all('tr')[1:]:
    player_name = row.find('td', attrs={'class': 'name'})
    name.append(player_name)
print(name)
Using player_name.text should work, but the problem is that row.find('td', attrs={'class': 'name'}) sometimes returns None, and None has no .text. Try like this:
if player_name:
    name.append(player_name.text)
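For context, a sketch of the full loop with that guard in place (same soup as in the question):
name = []
for row in soup.find_all('tr')[1:]:
    player_name = row.find('td', attrs={'class': 'name'})
    if player_name:  # skip rows that have no name cell
        name.append(player_name.text)
print(name)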
I solved it like this:
from bs4 import BeautifulSoup
import requests
url= "http://scores.espn.com/nba/boxscore?gameId=400900407"
r = requests.get(url)
soup = BeautifulSoup(r.text,"html.parser")
name = []
for row in soup.find_all('tr')[1:]:
    try:
        player_name = row.select('td.name span')[0].text
        name.append(player_name)
    except:
        pass
print(name)
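One caveat (my addition, not part of the original answer): the bare except above also swallows unrelated errors. A variant that avoids exception handling checks the result of select() first:
for row in soup.find_all('tr')[1:]:
    cells = row.select('td.name span')
    if cells:  # rows without a name cell yield an empty list
        name.append(cells[0].text)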
My code, for your reference:
import requests
from pyquery import PyQuery as pyq
url= "http://scores.espn.com/nba/boxscore?gameId=400900407"
r = requests.get(url)
doc = pyq(r.content)
print([h.text() for h in doc('.abbr').items()])

I want to crawl data from pages 1 to 10 of a website automatically. How can I do it?

import requests
from bs4 import BeautifulSoup
My_Url = "http://questions.consumercomplaints.in/page/2"
Data = requests.get(My_Url)
Soup = BeautifulSoup(Data.content)
head_id = Soup.find_all({"div":"href"})
len(head_id)
for i in head_id:
    print(i.text)
With the code above I scraped (reviews/complaints) from page 2.
How do I crawl data automatically from all the pages (http://questions.consumercomplaints.in/page/3, and so on)?
Why not wrap your code in a ranged for loop?
import requests
from bs4 import BeautifulSoup
for i in range(3,11):
    My_Url = "http://questions.consumercomplaints.in/page/" + str(i)
    Data = requests.get(My_Url)
    Soup = BeautifulSoup(Data.content)
    head_id = Soup.find_all({"div":"href"})
    for item in head_id:
        print(item.text)
Have a look at how the range function works.
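For instance (my illustration), range(3, 11) yields 3 through 10, because the stop value is exclusive:
print(list(range(3, 11)))  # [3, 4, 5, 6, 7, 8, 9, 10]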
