I am trying to retrieve keywords of a particular IEEE document. I came across this code here
ieee_content = requests.get(link, timeout=180)
soup = BeautifulSoup(ieee_content.text, 'lxml')
tag = soup.find_all('script')
#metadata = "".join(re.findall('global.document.metadata=(.*)', tag[9].text)).replace(";", '').replace('global.document.metadata=', '')
for i in tag[9]:
metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
metadata = re.findall(metadata_format, i)
if len(metadata) != 0:
# convert the list
convert_to_json = json.dumps(metadata)
x = json.loads(convert_to_json)
s = x[0].replace("'", '"').replace(";", '')
The problem is that my metadata variable is always empty. I tried to iterate across all tags rather than using tag[9], but metadata is still empty in all cases. I tried using 'xml' instead of 'lmxl' as well but the result is the same. I'd appreciate some help with this.
import json
import re
from pprint import pprint
import requests
from bs4 import BeautifulSoup
ieee_content = requests.get("https://ieeexplore.ieee.org/document/7845555", timeout=180)
soup = BeautifulSoup(ieee_content.content, "html.parser")
scripts = soup.find_all("script")
pattern = re.compile(r"(?<=\"keywords\":)\[{.*?}\]")
keywords_dict = {}
for i, script in enumerate(scripts):
keywords = re.findall(pattern, str(script.string))
if len(keywords) == 1:
raw_keywords_list = json.loads(keywords[0])
for keyword_type in raw_keywords_list:
keywords_dict[keyword_type["type"].strip()] = [kwd.strip() for kwd in keyword_type["kwd"]]
pprint(keywords_dict)
I am new to python. is anyone know {sum(int(td.text) for td in soup.select('td:last-child')[1:])} what is use of [1:] in this or [0] or [1]. i saw it in many scraping examples below for in loop. As i was practicing i build this code and don't able to scrape all data in csv file. thanks in advance, sorry for two question at one time.
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r= requests.get (url)
soup= BeautifulSoup (r.content, 'html5lib')
lst= []
table=soup.find ('div', attrs = {'class':'js-table'})
#for row in table.findAll ('div', attrs= {'class':'top-players__player-name'}):
# score = {}
# score['Player'] = row.a.text.strip()
# lst.append(score)
for row in table.findAll (class_='top-players__m top-players__padded '):
score = {}
score['Matches'] = int(row.td.text)
lst.append(score)
filename= 'iplStat.csv'
with open (filename, 'w', newline='') as f:
w= csv.DictWriter(f,['Player', 'Matches'])
w.writeheader()
for score in lst:
w.writerow(score)
print (lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd
url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get (url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index = False)
Screenshot of csv file:
When I write to the csv file all of my data is printed in only the first column. Using my loop, how do I iterate along the columns to write the data?
import csv
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
#For sites that can't be opened due to Urllib blocker, use a Mozilla User agent to get access
pageRequest = Request('https://coronavirusbellcurve.com/', headers = {'User-Agent': 'Mozilla/5.0'})
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
specificDiv = page_soup.find("div", {"class": "table-responsive-xl"})
TbodyStats = specificDiv.table.tbody.tr.contents
TbodyDates = specificDiv.table.thead.tr.contents
with open('CovidHTML.csv','w', newline= '') as file:
theWriter = csv.writer(file)
theWriter.writerow(['5/4', ' 5/5', ' 5/6',' 5/7',' 5/8',' 5/9'])
for i in range(3,len(TbodyStats)):
if i%2 != 0:
theWriter.writerow([TbodyStats[i].text])
Another method, For reference only.
from simplified_scrapy import SimplifiedDoc,utils,req
html = req.get('https://coronavirusbellcurve.com/')
doc = SimplifiedDoc(html)
specificDiv = doc.select('div.table-responsive-xl') # Get first div. If you want to get all divs, use this method: doc.selects('div.table-responsive-xl')
# TbodyStats = specificDiv.tbody.trs.selects('td|th').text # Get data
# TbodyDates = specificDiv.thead.trs.selects('td|th').text # Get date
data = specificDiv.table.trs.selects('td|th').text # Get all
rows = []
for row in data:
rows.append(row[1:])
utils.save2csv('test.csv',rows)
Result:
5/5,5/6,5/7,5/8,5/9
1213260,1237960,1266822,1294664,1314610
24423,24700,28862,27842,19946
2.05%,2.04%,2.33%,2.20%,1.54%
I think you may be able to do this (I can't test for sure because I don't have your exact data on hand):
row = []
for i in range(3, len(TbodyStats), 2):
row.append(TbodyStats[i].text)
if len(row) == 6:
theWriter.writerow(row)
row = []
I added the 'step' to your range so you don't have to use % for finding odd numbered indices, then just built each row until it hits 6 members, then flush that to the csv file, then empty the row so you can repeat the process.
Currently writing a webscraper to scrape some reviews. The goal is to scrape reviews over multiple URLS. Therefore, i made a list of urls. I want to retrieve the content of the specific reviews per url and merge them in one list.
When i only scrape one page, everything works like a charm. However, when i try to scrape multiple pagines. See the following code plus error:
from lxml import html
from urllib import request
import requests
from datetime import datetime
import dateparser
import csv
import re
links = open('file')
urls = links.readlines()
for url in urls:
req=requests.get(url)
tree = html.fromstring(request.urlopen(req).read().decode(encoding="utf-8",errors="ignore"))
reviews = tree.xpath('//*[#class="review-body"]')
reviews = [r.text_content() for r in reviews]
reviews = [r.replace('\n', ' ') for r in reviews]
reviews = [r.replace('\r', ' ') for r in reviews]
reviews = [r.replace(' ', '') for r in reviews]
protocol = req.type
AttributeError: 'Response' object has no attribute 'type'.
Can somebody explain to me what this is and how i can solve this?
You need to have reviews list outside of for loop.
This way you will fill it while iterating.
You can either:
append temporary list of reviews in each loop step (temp) and then you will have reviews = [ [...], [...]] or
add temporary list with + operator e.g. reviews += temp which should result in what you probably expect to end up with.
Here is the possible resolution:
from lxml import html
from urllib import request
import requests
from datetime import datetime
import dateparser
import csv
import re
links = open('file', 'r')
reviews = []
for url in links:
req = requests.get(url)
tree = html.fromstring(req.content.decode(encoding="utf-8", errors="ignore"))
temp = tree.xpath('//*[#class="review-body"]')
temp = [r.text_content() for r in temp]
temp = [r.replace('\n', ' ') for r in temp]
temp = [r.replace('\r', ' ') for r in temp]
temp = [r.replace(' ', '') for r in temp]
reviews += temp
As for the AttributeError it seems you are trying to access attribute that doesn't exist.
Edit 1.
links is an iterable that can be iterated to fetch line by line. This way you do not have to read all lines in memory.
req has a content and text attributes. Both holds page HTML source just depending on encoding.
Been using beautiful soup to iterate through pages, but for whatever reason I can't get the loop to advance beyond the first page. it seems like it should be easy because it's a text string, but it seems to loop back, maybe it's my structure not my text string?
Here's what I have:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
this gets the same first 40 records over and over.
I tried using this solution as an if and did find that doing
prevLink = soup.select('a[rel="nofollow"]')[0]
newurl = "http:" + prevLink.get('href')
did work better, but I'm not sure how to do the loop in such a way that it advances? possibly just tired but my loop there still just goes to the next set of records and gets stuck on that one. please help me fix my loop
UPDATE
my formatting was lost in the copy paste, my actual code looks like:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
Your code indenting was mostly at fault. Also it would be wise to actually use the CSV library you imported, this will automatically wrap the player names in quotes to avoid any commas inside from ruining the csv structure.
This works by looking for the link to the next page and extracting the starting count. This is then used to build your the next page get. If no next page can be found, it moves to the next year group. Note, the count is not a page count but a starting entry count.
import csv
import urllib2
from bs4 import BeautifulSoup
groups= ['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015", "2014", "2013", "2012"]
with open('nhlstats.csv', "wb") as f_output:
csv_output = csv.writer(f_output)
for yr in year:
for gr in groups:
start_count = 1
while True:
#print "{}, {}, {}".format(yr, gr, start_count) # show progress
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/{}/count/{}".format(yr, start_count)
page = urllib2.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row = [yr]
for ob in range(1, 15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
csv_output.writerow(row)
try:
start_count = int(soup.find(attrs= {"class":"page-numbers"}).find_next('a')['href'].rsplit('/', 1)[1])
except:
break
Using with will also automatically close your file at the end.
This would give you a csv file starting as follows:
2016,"Patrick Kane, RW",CHI,82,46,60,106,17,30,1.29,287,16.0,9,17,20
2016,"Jamie Benn, LW",DAL,82,41,48,89,7,64,1.09,247,16.6,5,17,13
2016,"Sidney Crosby, C",PIT,80,36,49,85,19,42,1.06,248,14.5,9,10,14
2016,"Joe Thornton, C",SJ,82,19,63,82,25,54,1.00,121,15.7,6,8,21
You are changing the URL many times before you are opening it the first time, due to an indentation error. Try this:
for gr in groups:
url = "...some_url..."
page = urllib2.urlopen(url)
...everything else should be indented....