I've been practicing web scraping with the nba.com player list, but I've run into a problem where a link that I scraped in one for loop does not appear when I reference it in another for loop.
I have already tried creating more variables in both the original for loop and the for loop I want the variable to show up in, but it still does not appear. I am trying to use the link that I scraped (it is not the full link, but rather the trailing part of the URL, which I'm trying to combine with the base of the link).
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div', class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        new_play = Player()
        new_play.name = name.text
        new_play.link = name["href"]
        player_list.append(new_play)
    for one_player in player_list:
        print(one_player.name)
        print(one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        browser.get('https://stats.nba.com' + p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        browser.quit()
        PPG = ""
        points1 = soup.find('a', href='/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a', href='/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    browser.quit()
    return player_list

player_list = get_player_stats(get_player_list())
As shown in the indented code starting with names.find_all('a'):, everything works properly and the link gets carried over and prints out following the template (e.g. Abrines, Alex /player/203518/). But when it gets to for p in player_list[0:2]:, p.link doesn't get carried over, and when I tried to print p.link, nothing was printed. Any help would be appreciated, as I've been testing out so many things for a while now!
The player_list variable is not a global variable; you define it inside a single function. To keep track of it across all functions, you should initialize it globally.
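A minimal sketch of that idea (with a plain string standing in for the Player objects):

player_list = []  # defined at module level, so every function can see it

def get_player_list():
    # append() mutates the existing list, so no `global` statement is needed;
    # `global player_list` is only required when rebinding the name itself
    player_list.append("Abrines, Alex")

def get_player_stats():
    for p in player_list:  # the same module-level list is visible here
        print(p)

get_player_list()
get_player_stats()  # prints: Abrines, Alex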
This happens because the first href is null. In that case you need to add a condition before appending to the list. I have added that step; check the code below.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div', class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        # skip anchors whose href is missing or empty; .get() avoids a
        # KeyError when the attribute is absent entirely
        if name.get("href"):
            new_play = Player()
            new_play.name = name.text
            new_play.link = name["href"]
            player_list.append(new_play)
    # for one_player in player_list:
    #     print(one_player.name)
    #     print(one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        print('https://stats.nba.com' + p.link)
        browser.get('https://stats.nba.com' + p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        PPG = ""
        points1 = soup.find('a', href='/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a', href='/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    # quit once, after the loop; quitting inside it would kill the session
    # before the second player's page is fetched
    browser.quit()
    return player_list

player_list = get_player_stats(get_player_list())
Related
I'm a fresher in this field, so I'm trying to navigate through a web page to scrape data. When I execute the code, it scrapes the first page's data but never navigates to the next pages. I have written code for pagination and have tried many things, but I couldn't find the problem. Please check my code below; can anyone help me with this? Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class kotsovolosmobiles:
    def __init__(self):
        self.url = 'https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country = 'GR'
        self.currency = 'euro'
        self.VAT = 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice = 'N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0, 0, "Product_Url")
        ws.write(0, 0, "Product_Manufacturer")
        ws.write(0, 1, "Product_Url")
        ws.write(0, 2, "Product_Price")
        ws.write(0, 3, "Product_Model")
        ws.write(0, 4, "Memory")
        ws.write(0, 5, "Currency")
        ws.write(0, 6, "Color")
        ws.write(0, 7, "VAT")
        ws.write(0, 8, "Shipping Cost")
        ws.write(0, 9, "Pre-PromotionPrice")
        ws.write(0, 10, "Country")
        ws.write(0, 11, "Date")
        ws.write(0, 12, "Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links = []
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall(r'\d+ ?[gG][bB]', p_model)
                print(memory)
                memory1 = str(memory).replace("['", '').replace("']", '').replace("[]", '').strip()
                if "," in memory1:
                    arr = memory1.split(",")
                    for str1 in arr:
                        str2 = str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2) != 1:
                            memory_str = str1
                            break
                elif (memory1 == ""):
                    memory_str = 'N/A'
                else:
                    memory_str = memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors = []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length - 3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['", '').replace("']", '').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)

kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
Maybe the page isn't loaded yet; try replacing the last block in the loop with:
next = driver.find_element_by_css_selector('.pagination_next a')
url = next.get_attribute('href')
driver.get(url)
time.sleep(3)  # Maybe it's not necessary
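If the timing is the problem, an explicit wait is more reliable than a fixed sleep. A minimal sketch, reusing the div[class="product"] selector from the question's code:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# block for up to 10 seconds until at least one product card is present;
# raises TimeoutException if the page never finishes loading
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class="product"]'))
)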
Try it like below. I was able to visit all of the other 5 pages.
driver.implicitly_wait(10)
driver.get("https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60")
time.sleep(5)  # Would be better to apply an explicit wait to click on `Απόρριψη`
driver.find_element_by_xpath("//a[contains(text(),'Απόρριψη')]").click()
nextbuttons = driver.find_element_by_xpath("//ul[@class='pagination']/li[5]/a")
length = int(nextbuttons.get_attribute("innerText"))
for i in range(2, length + 1):
    nextopt = driver.find_element_by_xpath("//ul[@class='pagination']/li/a[contains(text(),'{}')]".format(i))
    nextopt.click()
    time.sleep(5)
I'm scraping this website, but I have a problem after I run the code: it only prints the first event, and the for loop is not iterating.
To append the events to a list:
class Events(object):
    def __init__(self, title=None, place=None, date_posted=None, description=None, linkhref=None):
        self.title = title
        self.place = place
        self.date_posted = date_posted
        self.description = description
        self.linkhref = linkhref
I want to save it to a CSV file:
import csv

csv_file = open('scrape2.csv', 'w', encoding="utf-8")
csv_writer = csv.writer(csv_file)
# Columns
csv_writer.writerow(['title', 'link', 'place', 'date_posted', 'description'])
In this part I want to scrape 3 pages of the website and find the details of each event, such as the date, event name, venue, and description.
def scrape():
    for page in range(0, 2):
        page = page + 1
        base_url = 'https://allevents.in/malacca/all?ref=cityhome-popular' + str(page)
        source = requests.get(base_url)
        soup = BeautifulSoup(source.text, "html.parser")
        all_event = soup.find_all('div', class_="event-list listview")
        events = []
        for item in all_event:
            title = item.find("h3").text.strip()
            link = item.find("h3")
            linkhref = link.find("a").get('href')
            place = item.find("p", {"class": "location"}).text.strip()
            date_posted = item.find("div", {"class": "right"}).text.strip()
            description = item.find("p", {"class": "short-desc"}).text.strip()
            csv_writer.writerow([title, link, place, date_posted, description])
            events.append(Events(title, link, place, date_posted, description))
This is the for loop that iterates over the list:
for event in events:
    print("Title: " + event.title)
    print("Link: " + event.linkhref)
    print("Place: " + str(event.place))
    print("Date: " + event.date_posted)
    print("Description: " + event.description)
    print("-----------------------------------")
csv_file.close()
return 1
if __name__ == "__main__":
    print(scrape())
You can fetch and write the results in different ways. Moreover, you can make use of namedtuple to shake off verbosity. Here is how I would do it.
import csv
import requests
from bs4 import BeautifulSoup
from collections import namedtuple

class Events(object):
    def __init__(self):
        self.base_url = 'https://allevents.in/malacca/all?ref=cityhome-popular'
        self.items = namedtuple('itemDocument', ['title', 'linkhref', 'place', 'date_posted', 'description'])

    def scrape(self):
        source = requests.get(self.base_url)
        soup = BeautifulSoup(source.text, "lxml")
        for item in soup.find_all('div', class_="event-item"):
            title = item.find("h3").text.strip()
            linkhref = item.find("h3").find("a").get('href')
            place = ' '.join(item.find("p", {"class": "location"}).text.split())
            date_posted = item.find("div", {"class": "right"}).text.strip()
            description = item.find("p", {"class": "short-desc"}).text.strip()
            yield self.items(title, linkhref, place, date_posted, description)

if __name__ == "__main__":
    scraper = Events()
    with open("outputFile.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'linkhref', 'place', 'date_posted', 'description'])
        for item in scraper.scrape():
            writer.writerow([item.title, item.linkhref, item.place, item.date_posted, item.description])
Now, you can plug the logic for traversing different pages into the above script; I've left that out for brevity.
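As a rough sketch of that traversal, assuming the listing accepts a page number in its query string (the '&page=' parameter below is hypothetical; verify how allevents.in actually paginates), scrape() can take the page as a parameter:

class PagedEvents(Events):
    def scrape(self, page):
        # '&page=' is an assumed URL scheme -- check the site's real pagination links
        source = requests.get('{}&page={}'.format(self.base_url, page))
        soup = BeautifulSoup(source.text, "lxml")
        for item in soup.find_all('div', class_="event-item"):
            title = item.find("h3").text.strip()
            linkhref = item.find("h3").find("a").get('href')
            place = ' '.join(item.find("p", {"class": "location"}).text.split())
            date_posted = item.find("div", {"class": "right"}).text.strip()
            description = item.find("p", {"class": "short-desc"}).text.strip()
            yield self.items(title, linkhref, place, date_posted, description)

scraper = PagedEvents()
for page in range(1, 4):  # pages 1 to 3
    for item in scraper.scrape(page):
        print(item.title, item.linkhref)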
This is because find only returns the first element with the matching tag that it encounters. To get all elements with a specified tag such as "h3", use find_all instead. Do check the documentation first, and mark this answer as useful if it solves your issue.
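A quick illustration of the difference on a throwaway snippet of HTML:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><h3>first</h3><h3>second</h3></div>", "html.parser")
print(soup.find("h3").text)                   # first  -- only the first match
print([h.text for h in soup.find_all("h3")])  # ['first', 'second'] -- every match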
You are almost there; just change the middle section of your code, beginning at the assignment of all_event:
all_event = soup.find_all('h3')
events = []
for item in all_event:
    title = item.a.text
    linkhref = item.a['href']
    place = item.findNext('span').text.strip()
    date_posted = item.findNext('div', class_="right").text.strip()
    description = item.findNext('p', class_="short-desc").text.strip()
It should work from there, maybe with some modifications.
I would like to scrape a website. The website shows a preview of 10 complaints on each page. I wrote this script to get the links of those 10 complaints and some info inside each link. When I run the script, I get the error "RecursionError: maximum recursion depth exceeded".
Can someone tell me what the problem is? Thank you in advance!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []

# Get 10 links for preview of complaints
def getLinks(url):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    c_containers = html_soup.find_all('div', class_='media')
    # Store wanted links in a list
    allLinks = []
    for link in c_containers:
        find_tag = link.find('a')
        find_links = find_tag.get('href')
        full_link = "".join((url, find_links))
        allLinks.append(full_link)
    # Get total number of links
    print(len(allLinks))
    return allLinks

def GetData(Each_Link):
    each_complaint_page = get(Each_Link)
    html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')
    # Get date of complaint
    dt = html_soup.main.find('span')
    date = dt['title']
    C_date.append(date)
    # Get Title of complaint
    TL = html_soup.main.find('h1', {'class': 'title'})
    Title = TL.text
    C_title.append(Title)
    # Get main text of complaint
    Tx = html_soup.main.find('div', {'class': 'description'})
    Text = Tx.text
    C_text.append(Text)
    # Get user name and id
    Uname = html_soup.main.find('span', {'class': 'user'})
    User_name = Uname.span.text
    User_id = Uname.attrs['data-memberid']
    U_name.append(User_name)
    U_id.append(User_id)
    # Get view count of complaint
    Vcount = html_soup.main.find('span', {'view-count-detail'})
    View_count = Vcount.text
    C_count.append(View_count)
    # Get reply for complaint
    Rpnm = html_soup.main.find('h4', {'name'})
    Reply_name = Rpnm.next
    R_name.append(Reply_name)
    # Get reply date
    Rpdt = html_soup.main.find('span', {'date-tips'})
    Reply_date = Rpdt.attrs['title']
    R_date.append(Reply_date)
    # Get reply text
    Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
    Reply_text = Rptx.text
    R_text.append(Reply_text)

link_list = getLinks('https://www.sikayetvar.com/arcelik')
for i in link_list:
    z = GetData(i)
    print(z)
PS: My next step will be to put all information in a data frame
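For that next step, a minimal sketch, assuming the lists above all end up the same length (pandas requires equal-length columns):

import pandas as pd

df = pd.DataFrame({
    'date': C_date,
    'title': C_title,
    'text': C_text,
    'user_name': U_name,
    'user_id': U_id,
    'view_count': C_count,
    'reply_name': R_name,
    'reply_date': R_date,
    'reply_text': R_text,
})
print(df.head())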
Your GetData() method calls itself with no base case; this causes infinite recursion:

def GetData(data):
    for i in GetData(data):
You're also calling response = get(i) but then ignoring the result... perhaps you meant to say:

def GetData(link):
    i = get(link)
    ...
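To illustrate, a recursive function only terminates when every call moves toward a base case:

def countdown(n):
    if n == 0:            # base case: stops the recursion
        return
    countdown(n - 1)      # recursive case: each call gets closer to 0

countdown(5)     # fine
# countdown(-1)  # never reaches the base case -> RecursionError, like GetData()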
I am writing a program that scrapes a website and extracts the names and links from a table. I store each name and its respective link in an object and append it to a list of objects in a for loop. I want to save this list as a CSV file, but I haven't been able to do so using the file writer, as it converts the whole object to a str, which is not what I want. I would be really grateful if someone could guide me with this. Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

class Player():
    def __init__(self):
        self.name = ""
        self.link = ""

player_list = []
driver = webdriver.PhantomJS(executable_path=r'C:\Users\sarim\Desktop\Scraper\phantomjs.exe')
driver.get('https://dak.gg/ranks/na/solo-fpp/rating')
pause = 0
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
html_doc = driver.page_source
soup = BeautifulSoup(html_doc, 'lxml')
players = soup.find('table', class_='list rating')
player_names = players.find_all('td', class_='nick')
add_link = "https://dak.gg"
for i in player_names[0:3]:
    newPlay = Player()
    n = i.find('a')
    l = add_link + n['href']
    newPlay.link = l
    newPlay.name = n.text.strip()
    # print(newPlay.name)
    # print(newPlay.link)
    player_list.append(newPlay)
# print(player_names[1])
driver.quit()
Try something like this
import csv

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Name', 'Link'])
    for player in player_list:
        writer.writerow([player.name, player.link])
I'm trying many methods to clean up this data, but none of them work. The strip() and replace() methods produce the result shown in the attached screenshot; it doesn't work. Please help me.
import requests
from lxml import html, etree
from selenium import webdriver
import time

file_name = 'dubanxinlixue.json'
driver = webdriver.Chrome()
url_string = []
name_data, price_data = [], []
jd_goods_data = {}
page = 0
while True:
    url = 'https://book.douban.com/tag/%E5%BF%83%E7%90%86%E5%AD%A6?start={page}&type=S'.format(page=page)
    url_string.append(url)
    page += 20
    if page > 980:
        break
for i in url_string:
    driver.get(i)
    base_html = driver.page_source
    selector = etree.HTML(base_html)
    j = 1
    for j in range(20):
        j += 1
        name = '//*[@id="subject_list"]/ul/li[%d]/div[2]/h2/a[1]/@title' % (j)
        get_name = selector.xpath(name)[0]
        describe = '//*[@id="subject_list"]/ul/li[%d]/div[2]/div[1]/text()' % (j)
        get_describe = selector.xpath(describe)[0]
        get_describe.string.strip()
        print(get_describe)
The get_describe output looks like the attached screenshot (image not reproduced here), with extra whitespace around the text.
strip() returns a new string rather than modifying the original in place, so assign the result:

new_get_describe = get_describe.strip()
print(new_get_describe)