I am writing a program that scrapes a website and extracts names and links from a table. In a for loop I store each name and its link in an object and append that object to a list. I want to save this list as a CSV file, but I haven't been able to do so with the file writer, since it converts the whole object to a string, which is not what I want. I would be really grateful if someone could guide me with this. Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup
import time  # needed for time.sleep() below

class Player():
    def __init__(self):
        self.name = ""
        self.link = ""

player_list = []

driver = webdriver.PhantomJS(executable_path=r'C:\Users\sarim\Desktop\Scraper\phantomjs.exe')
driver.get('https://dak.gg/ranks/na/solo-fpp/rating')

pause = 0
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight

html_doc = driver.page_source
soup = BeautifulSoup(html_doc, 'lxml')

players = soup.find('table', class_='list rating')
player_names = players.find_all('td', class_='nick')
add_link = "https://dak.gg"

for i in player_names[0:3]:
    newPlay = Player()
    n = i.find('a')
    l = add_link + n['href']
    newPlay.link = l
    newPlay.name = n.text.strip()
    #print newPlay.name
    #print newPlay.link
    player_list.append(newPlay)

#print player_names[1]
driver.quit()
Try something like this:

import csv

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Name', 'Link'])
    for player in player_list:
        writer.writerow([player.name, player.link])
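If you would rather keep named columns, csv.DictWriter works as well. This is just a sketch that assumes the same player_list of Player objects built in the question:

import csv

# Sketch: write the same player_list with csv.DictWriter (assumes the Player
# objects from the question, each with .name and .link attributes).
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Link'])
    writer.writeheader()
    for player in player_list:
        writer.writerow({'Name': player.name, 'Link': player.link})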
So I wrote this code to web-scrape CNN and get articles about a specific topic:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import sys
import json
import os

serch = input('What News are you looking for today? ')
serch_term = str(serch)
real_serch = f'{serch_term}'
url = f'https://edition.cnn.com/search?q={real_serch}'

options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-error")
options.add_argument("--ignore-ssl-errors")
service = Service(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
time.sleep(4)
soup = BeautifulSoup(driver.page_source, 'html.parser')
#driver.close()

SAVED_DATA = "data.json"

def save_data(filepath, data):
    with open(filepath, "w") as f:
        json.dump(data, f)

def load_data(filepath):
    try:
        with open(filepath, "r") as f:
            data = json.load(f)
        return data
    except:
        return {}

def only_get_title():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        return(title)

def get_href():
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:' + url_
        return(abs_url)

def store():
    data = load_data(SAVED_DATA)
    key = only_get_title()
    data[key] = get_href()
    save_data(SAVED_DATA, data)
    print("News saved!")

if __name__ == '__main__':
    store()
My question is that abs_url is supposed to hold many links, for the different articles that were found on that subject on CNN. I want to go to every one of these links and save the data, but the code only opens up the first link stored in abs_url and not the others. How can I open every link and save each one in my JSON file, as you can see in the code?
You run return inside the for loop, so you exit the function at the first link.
You should add all links to a list and return that list after the for loop:
def get_href():
    all_results = []

    # --- loop ---
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url_ = h3.get('href')
        abs_url = 'https:' + url_
        all_results.append(abs_url)

    # --- after loop ---
    return all_results
You have the same problem with titles:
def only_get_title():
    all_results = []

    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        all_results.append(title)

    # --- after loop ---
    return all_results
Later you will need a for loop with zip() to create (title, url) pairs:
def store():
    data = load_data(SAVED_DATA)

    all_titles = only_get_title()
    all_urls = get_href()

    for title, url in zip(all_titles, all_urls):
        data[title] = url

    save_data(SAVED_DATA, data)
    print("News saved!")
But it may be simpler and more readable to get the title and URL in one function and create the pairs as you append them to the list:
def get_articles():
    all_results = []

    # --- loop ---
    for h3 in soup.select('h3.cnn-search__result-headline > a'):
        title = h3.text
        url = h3.get('href')
        abs_url = 'https:' + url
        pair = (title, abs_url)
        all_results.append(pair)

    # --- after loop ---
    return all_results

def store():
    data = load_data(SAVED_DATA)

    all_articles = get_articles()
    for title, url in all_articles:
        data[title] = url

    save_data(SAVED_DATA, data)
    print("News saved!")
This can also be safer when you want to get more details from an article, because if an article doesn't have some detail you can add None or a default value. With separate functions you may skip empty elements, and zip() will then create wrong pairs (tuples), as sketched below.
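For example, a minimal sketch with made-up titles and URLs showing how zip() silently shifts the pairing once one list skips an element:

# Hypothetical data: the second article has no href, so the URL list is shorter.
titles = ["Article A", "Article B", "Article C"]
urls = ["https://example.com/a", "https://example.com/c"]  # "B" was skipped

# zip() stops at the shorter list and shifts the pairing:
# ("Article A", ".../a"), ("Article B", ".../c") -- "Article B" gets C's URL.
for title, url in zip(titles, urls):
    print(title, "->", url)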
I am trying to extract names from custom <h2> elements, but the names I want are extracted many times.
How can I fix this problem and extract each name only one time?
The page I am pulling data from is here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if page_num > page_limit // 25:
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class": "profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class": "photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)

with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, so a dictionary could be used to hold all of your data. Simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False

    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)

        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)

                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")

                    a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)

                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None

                    div_logo = soup_details.find("div", {"class": "photo-container"})

                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None

                    a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})

                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None

                    lawyers[name] = [phone, logo, website]
                    found = True

    # Keep going until no new names are found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])

    for name, details in lawyers.items():
        csv_output.writerow([name, *details])
I'm a fresher in this field, so I'm trying to navigate through web pages to scrape data. When I execute the code it scrapes the first page's data but never navigates to the next pages. I have tried many ways but couldn't find a solution. Please check my code below; I have written code for the pagination. Can anyone please help me with this? Thanks in advance.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class kotsovolosmobiles:
    def __init__(self):
        self.url = 'https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country = 'GR'
        self.currency = 'euro'
        self.VAT = 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice = 'N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0,0,"Product_Url")
        ws.write(0,0,"Product_Manufacturer")
        ws.write(0,1,"Product_Url")
        ws.write(0,2,"Product_Price")
        ws.write(0,3,"Product_Model")
        ws.write(0,4,"Memory")
        ws.write(0,5,"Currency")
        ws.write(0,6,"Color")
        ws.write(0,7,"VAT")
        ws.write(0,8,"Shipping Cost")
        ws.write(0,9,"Pre-PromotionPrice")
        ws.write(0,10,"Country")
        ws.write(0,11,"Date")
        ws.write(0,12,"Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")

        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)

        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links = []
        p_colors = []

        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall(r'\d+ ?[gG][bB]', p_model)
                print(memory)
                memory1 = str(memory).replace("['", '').replace("']", '').replace("[]", '').strip()
                if "," in memory1:
                    arr = memory1.split(",")
                    for str1 in arr:
                        str2 = str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2) != 1:
                            memory_str = str1
                            break
                elif (memory1 == ""):
                    memory_str = 'N/A'
                else:
                    memory_str = memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors = []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length - 3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['", '').replace("']", '').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)

            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)

kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
Maybe the page isn't loaded yet; try replacing the last block in the loop with:
next = driver.find_element_by_css_selector('.pagination_next a')
url = next.get_attribute('href')
driver.get(url)
time.sleep(3)  # Maybe it's not necessary
Try it like below; I was able to visit all of the other 5 pages.
driver.implicitly_wait(10)
driver.get("https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60")
time.sleep(5)  # Would be better to apply an explicit wait to click on `Απόρριψη`
driver.find_element_by_xpath("//a[contains(text(),'Απόρριψη')]").click()
nextbuttons = driver.find_element_by_xpath("//ul[@class='pagination']/li[5]/a")
length = int(nextbuttons.get_attribute("innerText"))
for i in range(2, length + 1):
    nextopt = driver.find_element_by_xpath("//ul[@class='pagination']/li/a[contains(text(),'{}')]".format(i))
    nextopt.click()
    time.sleep(5)
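As the comment above suggests, an explicit wait is usually more reliable than a fixed sleep. A minimal sketch of that idea, assuming the same driver, the same Απόρριψη cookie button, and the same selenium 3 style API used above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Sketch: wait up to 10 seconds for the cookie button to become clickable
# instead of sleeping a fixed 5 seconds.
wait = WebDriverWait(driver, 10)
cookie_button = wait.until(
    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Απόρριψη')]"))
)
cookie_button.click()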
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import multiprocessing

url = []  # shared list of page URLs filled by get_urls()

class Crawler():
    def __init__(self):
        self.pag = 1
        i = 0

    def get_urls(self, main_url):
        self.url = 'https://www.test.ro/search/' + main_url + '/p1'
        self.filename = main_url
        r = requests.get(self.url)
        soup = BeautifulSoup(r.text, 'html.parser')
        number_pages = soup.find(class_='row')
        last_page = number_pages.find_all('a')[len(number_pages.find_all('a')) - 2].get("data-page")
        for i in range(1, int(last_page) + 1):
            url.append('https://www.test.ro/search/' + main_url + '/p' + str(i))

    def print_urls(self):
        for urls in url:
            print(urls)

    def scrape(self, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        product_list = soup.find(class_='page-container')
        product_list_name = product_list.find_all('h2')
        product_list_oldprice = product_list.find_all(class_='product-old-price')
        product_list_newprice = product_list.find_all(class_='product-new-price')
        for i in range(0, len(product_list_name)):
            name = product_list_name[i].get_text().strip()
            link = product_list_name[i].find('a').get('href')
            #print(name)
            #print(len(name))
            try:
                price = product_list_oldprice[i].contents[0].get_text()
                price = price[:-6]
                #print(price)
            except IndexError:
                price = ''  # keeps the except block valid and price always defined
                #print("no old price")
                #print(product_list_newprice[i].contents[0])
            with open(self.filename + '.csv', 'a', encoding='utf-8', newline='') as csv_file:
                file_is_empty = os.stat(self.filename + '.csv').st_size == 0
                fieldname = ['name', 'link', 'price_old', 'price_actualy']
                writer = csv.DictWriter(csv_file, fieldnames=fieldname)
                if file_is_empty:
                    writer.writeheader()
                writer.writerow({'name': name, 'link': link, 'price_old': price, 'price_actualy': product_list_newprice[i].contents[0]})

if __name__ == '__main__':
    print("Search for product: ")
    urlsearch = input()
    starttime = time.time()
    scraper = Crawler()
    scraper.get_urls(urlsearch)
    scraper.print_urls()
    #scraper.scrape(url[0])
    pool = multiprocessing.Pool()
    pool.map(scraper.scrape, url)
    pool.close()
    print('That took {} seconds'.format(time.time() - starttime))
So I have this scraper. It works perfectly on the website for any search term (e.g. "bag"), but only on the product listing pages.
I wrote it for a specific website, but how could I go to each product's own page, take the data from that product, return it, and then do the same again for the next product?
Is such a thing possible?
Right now I take the data from the listing page, i.e. name, link, price.
Each product page has its own divs there too.
Could the href help me with this?
In this case you need to create a category scraper that saves all the product URLs first. Scrape all the URLs, go through all the categories and, for example, save them (the product URLs) to a CSV first. Then you can take all the product URLs from the CSV and loop through all of them.
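A rough sketch of that two-phase approach; the listing selector 'h2 > a' and the file name product_urls.csv are only assumptions standing in for the real site's markup:

import csv
import requests
from bs4 import BeautifulSoup

# Phase 1 (sketch): collect product URLs from the listing pages and save them to a CSV.
def save_product_urls(listing_urls, filepath='product_urls.csv'):
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url'])
        for listing_url in listing_urls:
            soup = BeautifulSoup(requests.get(listing_url).text, 'html.parser')
            for a in soup.select('h2 > a'):  # assumed selector for product links
                writer.writerow([a.get('href')])

# Phase 2 (sketch): read the saved URLs back and scrape each product page.
def scrape_saved_products(filepath='product_urls.csv'):
    with open(filepath, newline='', encoding='utf-8') as f:
        product_urls = [row['url'] for row in csv.DictReader(f)]
    for product_url in product_urls:
        soup = BeautifulSoup(requests.get(product_url).text, 'html.parser')
        # ... extract whatever details the product page offers here ...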
I've been practicing web scraping with the nba.com player list, but I've run into a problem where a link that I scraped in one for loop does not appear when I call on it in another for loop.
I have already tried making more variables in both the original for loop and the for loop I want the variable to show up in, but it does not appear. I am trying to use the link that I scraped (it is not the full link but rather the end part of the link, which I'm trying to combine with the base of the link).
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div', class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        new_play = Player()
        new_play.name = name.text
        new_play.link = name["href"]
        player_list.append(new_play)
    for one_player in player_list:
        print(one_player.name)
        print(one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        browser.get('https://stats.nba.com' + p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        browser.quit()
        PPG = ""
        points1 = soup.find('a', href='/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a', href='/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    browser.quit()
    return player_list

player_list = get_player_stats(get_player_list())
As shown in the code above starting with names.find_all('a'):, everything works properly and the link gets transferred and prints out following the template (e.g. Abrines, Alex /player/203518/). But when it gets to for p in player_list[0:2]:, p.link doesn't get transferred over, and when I tried to print p.link, nothing got printed. Any help would be appreciated, as I've been testing out so many things for a while now!
The player_list variable is not a global variable. You define it inside a single function; to keep track of it across all functions, you should initialize it globally.
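A minimal sketch of what that suggestion looks like; the function bodies here are only placeholders for the ones in the question:

player_list = []  # module-level, visible to every function

def get_player_list():
    global player_list      # rebind the module-level name inside the function
    player_list = ["example entry"]  # fill it here, e.g. while scraping

def get_player_stats():
    # reads the same module-level list that get_player_list() filled
    for p in player_list:
        print(p)

get_player_list()
get_player_stats()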
This is because the first href is null. In that case you need to put a condition in before adding it to the list. I have added that step; now check:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

class Player():
    """docstring for ClassName"""
    def __init__(self):
        self.name = ""
        self.link = ""
        self.PPG = ""
        self.RPG = ""

def get_player_list():
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    browser.get('https://stats.nba.com/players/list/')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    browser.quit()
    names = soup.find('div', class_='stats-player-list players-list')
    player_list = []
    for name in names.find_all('a'):
        if name["href"]:  # skip entries whose href is empty
            new_play = Player()
            new_play.name = name.text
            new_play.link = name["href"]
            player_list.append(new_play)
    # for one_player in player_list:
    #     print(one_player.name)
    #     print(one_player.link)
    return player_list

def get_player_stats(player_list):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    browser = webdriver.Chrome(options=opt)
    for p in player_list[0:2]:
        print('https://stats.nba.com' + p.link)
        browser.get('https://stats.nba.com' + p.link)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        PPG = ""
        points1 = soup.find('a', href='/players/traditional/?sort=PTS&dir=-1')
        points = points1.div
        for point in points.findNextSiblings():
            PPG = "PPG" + point.text
        RPG = ""
        rebounds1 = soup.find('a', href='/players/traditional/?sort=REB&dir=-1')
        rebounds = rebounds1.div
        for rebound in rebounds.findNextSiblings():
            RPG = "RPG" + rebound.text
        p.PPG = PPG
        p.RPG = RPG
    browser.quit()  # quit once, after the loop, so the driver stays usable for every player
    return player_list

player_list = get_player_stats(get_player_list())