Python/Selenium web scraping LinkedIn does not work (invalid link?) - python

driver.get('https://www.linkedin.com')
username = driver.find_element(By.ID, 'session_key')
username.send_keys('*********')
sleep(0.5)
password = driver.find_element(By.ID, 'session_password')
password.send_keys('******')
sleep(0.5)
sign_in_button = driver.find_element(By.XPATH, '//*[@type="submit"]')
sign_in_button.click()
sleep(15)

Companydata = []
lnks = []

for x in range(0, 20, 10):
    driver.get(f'https://www.google.com/search?q=site%3Alinkedin.com%2Fcompany+AND+%22ICT%22+AND+%22Nederland%22&client=safari&sxsrf=AJOqlzX8ktoj6XZ-yaWqpd6O-Q31wvoxsg%3A1676293189155&ei=RTTqY96MCd2F9u8Pi_ig-As&ved=0ahUKEwiew9DgxpL9AhXdgv0HHQs8CL8Q4dUDCA8&uact=5&oq=site%3Alinkedin.com%2Fcompany+AND+%22ICT%22+AND+%22Nederland%22&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQA0oECEEYAUoECEYYAFDHKljmQmC2Q2gDcAB4AIABOYgBsQWSAQIxNpgBAKABAcABAQ')
    time.sleep(random.uniform(2.5, 4.9))
    linkedin_url = [my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='yuRUbf']")))]
    linkedin_url = str(linkedin_url)
    lnks.append(linkedin_url)
    sleep(2)

for x in lnks:
    for i in x:
        url = i
        driver.get(i)
        time.sleep(random.uniform(2.5, 4.9))
        sel = Selector(text=driver.page_source)

        name = sel.xpath('//*[starts-with(@class, "ember-view t-24 t-black t-bold full-width")]/text()').extract()
        if name:
            name = name.strip()

        Company_branch = sel.xpath('//*[starts-with(@class, "org-top-card-summary-info-list__info-item")]/text()').extract()
        if Company_branch:
            Company_branch = Company_branch.strip()

        location = sel.xpath('//*[starts-with(@class, "inline-block")]/text()').extract()
        if location:
            location = location.strip()

        name = validate_field(name)
        Company_branch = validate_field(Company_branch)
        location = validate_field(location)
        url = validate_field(url)

        print('\n')
        print('Name: ' + name)
        print('Bedrijfstak: ' + Company_branch)
        print('Location: ' + location)
        print('URL ' + url)
        print('\n')

        data = {
            'Name': name,
            'Bedrijfstak': Company_branch,
            'Location': location,
            'URL': url
        }
        Companydata.append(data)

df = pd.DataFrame(Companydata)
df.to_excel('Companydata_linkedin.xlsx')
driver.quit()
It looks like the problem starts in these lines of code:
for x in range(0, 20, 10):
    driver.get(f'https://www.google.com/search?q=site%3Alinkedin.com%2Fcompany+AND+%22ICT%22+AND+%22Nederland%22&client=safari&sxsrf=AJOqlzX8ktoj6XZ-yaWqpd6O-Q31wvoxsg%3A1676293189155&ei=RTTqY96MCd2F9u8Pi_ig-As&ved=0ahUKEwiew9DgxpL9AhXdgv0HHQs8CL8Q4dUDCA8&uact=5&oq=site%3Alinkedin.com%2Fcompany+AND+%22ICT%22+AND+%22Nederland%22&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQA0oECEEYAUoECEYYAFDHKljmQmC2Q2gDcAB4AIABOYgBsQWSAQIxNpgBAKABAcABAQ')
    time.sleep(random.uniform(2.5, 4.9))
    linkedin_url = [my_elem.get_attribute("href") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='yuRUbf']")))]
    linkedin_url = str(linkedin_url)
    lnks.append(linkedin_url)
Somehow the linkedin_url list never ends up holding usable links.
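A likely explanation, offered only as a sketch (Google's result markup changes often, so treat the class names as assumptions): //div[@class='yuRUbf'] matches the result <div>, which has no href attribute, so get_attribute("href") returns None for every element, and str(linkedin_url) then turns the whole list into one string, which makes the later for i in x loop iterate over single characters. Collecting the href from the <a> inside that div and keeping a flat list of URL strings would look roughly like this (the &start= parameter is Google's standard pagination, matching the intent of range(0, 20, 10); the trimmed search_url is a hypothetical shorthand for the long URL above):

search_url = ('https://www.google.com/search?'
              'q=site%3Alinkedin.com%2Fcompany+AND+%22ICT%22+AND+%22Nederland%22')

lnks = []
for start in range(0, 20, 10):
    driver.get(f'{search_url}&start={start}')            # page through the results
    time.sleep(random.uniform(2.5, 4.9))
    anchors = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located(
            (By.XPATH, "//div[@class='yuRUbf']/a")))      # the <a> carries the href
    lnks.extend(a.get_attribute('href') for a in anchors)  # keep a flat list of strings

for url in lnks:    # each item is already a complete URL
    driver.get(url)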

Related

How to parse data after specific text Python Selenium bs4

On one of the sites for which I am writing a parser, I ran into the following problem:
I need to take all the data from a table, but the cells are not labelled in the HTML and their positions change from page to page.
(screenshot of the HTML)
The table looks like this:
(screenshot of the table)
At first I used XPath for this, but while parsing I found that some values swap places (for example engine and registration number) or are missing entirely, so a fixed XPath is not suitable: the mileage can end up in the engine column of the CSV file.
Is it possible, in Selenium or with bs4, to first search for a word and then parse the data after it?
That is, find the word Engine in the HTML and then take the value below it.
(screenshot of the HTML fragment I need)
My code:
import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth
def collect_data():
    global driver
    options = webdriver.ChromeOptions()
    options.set_preference('general.useragent.override',
                           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                           'Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Background mode
    # options.add_argument('headless')

    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        driver.get(
            url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
        )
        time.sleep(10)

        '''Collect all URLs'''
        soup = BeautifulSoup(driver.page_source, 'lxml')
        car_url_list = []
        total_page = soup.find('span', class_='totPage').text
        print('Ford Mustang')
        print(f'Total pages: {total_page}')
        print(f'Page 1 of {total_page} URL collected')
        r = (int(total_page) + 1)
        count = 1
        for i in range(1, r, 1):
            driver.get(
                url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={i}'
            )
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            car_cards = soup.find_all('a', class_='tricky_link')
            count += 1
            print(f'Page {count} of {total_page} URL collected')
            for car_ulr in car_cards:
                car_ulr = car_ulr.get('href')
                car_url_list.append(car_ulr)
        with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
            for line in car_url_list:
                file.write(f'{line}\n')

        count = 0
        row = []

        '''Collect car's data'''
        with open('ford_mustang_url.txt', encoding='utf8') as f:
            r = len(car_url_list)
            print('Total cars: ' + str(r))
            for i in range(r):
                driver.get(f.readline())
                driver.implicitly_wait(30)
                soup = BeautifulSoup(driver.page_source, 'lxml')
                count += 1

                '''Car Data'''
                car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace(' ', ' ').strip()
                car_price = soup.find('span', class_='GAPrice').find('span').text
                car_year = soup.find('div', class_='mid_border').get('data-year')
                car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
                car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
                car_url = soup.find('link', hreflang='en').get('href')
                # car_engine

                '''If section'''
                if car_reg_number == 'ABC-123':
                    car_reg_number = None
                if car_mileage == '100000000':
                    car_mileage = None

                print(f'{count}. ' + car_name)
                print('Price: ' + f'{car_price}')
                print('Year: ' + f'{car_year}')
                print('Mileage: ' + f'{car_mileage}')
                print('Reg.Number: ' + f'{car_reg_number}')
                print('URL: ' + f'{car_url}\n')

                data = {
                    'Name': car_name,
                    'Price': car_price,
                    'Year': car_year,
                    'Mileage': car_mileage,
                    'Reg.Number': car_reg_number,
                    'URL': car_url,
                }
                row.append(data)

        csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(row)

    except Exception as ex:
        print(ex)
    finally:
        driver.close()
        driver.quit()


def main():
    collect_data()


if __name__ == '__main__':
    main()
I have found a partial solution with Selenium using if/else:
car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[1]').text
if car_engine == 'Engine':
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[2]').text.split(" ", 2)[0]
else:
    car_engine = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[1]/td[5]').text.split(" ", 2)[0]
For Drive type it doesn't work, so I did this...
drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
if drive_type == 'Drive type':
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
else:
    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[4]').text
    if drive_type == 'Drive type':
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[5]').text
    else:
        drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[1]').text
        if drive_type == 'Drive type':
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[4]/td[2]').text
        else:
            drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[1]').text
            if drive_type == 'Drive type':
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[3]/td[2]').text
            else:
                drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[4]').text
                if drive_type == 'Drive type':
                    drive_type = driver.find_element(By.XPATH, '//*[@id="id_adInfo"]/div[1]/div[1]/table/tbody/tr[2]/td[5]').text
                else:
                    pass
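As a sketch of the label-driven lookup the question asks about (it assumes each label such as Engine or Drive type sits in its own <td> and that the value is in the <td> immediately after it, which may not hold on every ad page), the fixed row/column indexes above can be replaced with an XPath that searches for the label text and takes the following cell:

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def value_after_label(driver, label):
    # find the cell whose text is exactly the label, then return the next cell's text
    try:
        return driver.find_element(
            By.XPATH,
            f'//*[@id="id_adInfo"]//td[normalize-space()="{label}"]/following-sibling::td[1]'
        ).text
    except NoSuchElementException:
        return None

car_engine = value_after_label(driver, 'Engine')
drive_type = value_after_label(driver, 'Drive type')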
Here is a solution for your problem, not based on selenium (it's not the right tool for this job), which will produce a dataframe/csv with all the details you're after:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

scraper = cloudscraper.create_scraper()
big_df = pd.DataFrame()
urls_list = []

for x in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    car_links = [x.get('href') for x in soup.select_one('div#listingData').select('a.tricky_link')]
    for link in car_links:
        urls_list.append(link)

for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    dfs = pd.read_html(str(r.text))
    df_list = []
    title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
    subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
    df_list.append(('make_model', title))
    df_list.append(('variant', subtitle))
    for i, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    big_df = big_df.append(correct_df)

big_df.to_csv('finnish_cars.csv')
A couple of notes: the first two cars' descriptions are in Finnish and the rest are in English, so the final df/csv will look a bit funny, but the data will be there. Also, you might get some warnings in the terminal about pandas append being deprecated in favour of concat, but those are just warnings; the program will run.
You can install cloudscraper with pip install cloudscraper, and tqdm with pip install tqdm. Of course, if you're keen on using Selenium, you can apply the same methods to the HTML obtained from Selenium.
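If those warnings bother you, one hedged alternative is to use pd.concat, which replaces the deprecated DataFrame.append in recent pandas: collect one small frame per car and build the big one in a single step.

frames = []
for url in tqdm(urls_list):
    # ... build correct_df for this car exactly as above ...
    frames.append(correct_df)

big_df = pd.concat(frames, ignore_index=True)   # one concatenation instead of repeated append
big_df.to_csv('finnish_cars.csv')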

I can't get the reviews from each product page on AliExpress

I'm trying to scrape data from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth+earphones In particular, I want to get all the reviews from each product page. The main issue is that I'm struggling to reach the section outlined at the bottom of the page in order to scrape each comment and the customer's country.
Here is a photo showing that:
(screenshot of the reviews section)
This is my code:
from selenium import webdriver
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'

with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders", "Shipping Cost", "Product links", "Country", "Comments"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)
        results = []

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            if title:
                title = title[0]

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            currency = price[0]
            price = ''.join(price[1:])

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            if review:
                review = review[0]
            else:
                review = ''

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = ''

            ###########################################

            links = product.xpath('//div[@class="JIIxO"]//a/@href')
            if links:
                links = links[0]
            else:
                links = ''

            # scraping data from each inner page
            for link in links:
                driver.get(link)
                sleep(2)

                current_offset = 0
                while True:
                    driver.execute_script("window.scrollBy(0, window.innerHeight);")
                    sleep(.5)
                    new_offset = driver.execute_script("return window.pageYOffset;")
                    print(new_offset, current_offset)
                    if new_offset <= current_offset:
                        break
                    current_offset = new_offset

                sleep(3)

                tree = html.fromstring(driver.page_source)

                for cmt in tree.xpath('//*[@id="transction-feedback"]/div[5]/div[1]'):
                    country = cmt.xpath('.//div[@class="user-country"]//b/text()')
                    if country:
                        country = country[0]
                    else:
                        country = ''

                    comment = cmt.xpath('.//span[@id="0.0.0.i4.5dc4sSFDsSFD5B"]/text()')
                    if comment:
                        comment = comment[0]
                    else:
                        comment = ''

                    row = [title, price, currency, review, nb_sold, ship_cost, links, country, comment]
                    results.append(row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()
There are two problems:
First:
You have to use html.fromstring(driver.page_source) AFTER you scroll down.
Second:
The page adds items only when they are displayed inside the window (in the viewport), so you can't jump directly to the end of the page. You have to scroll in steps (in a loop), e.g. by window.innerHeight.
current_offset = 0
while True:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # JavaScript has time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    #print(new_offset, current_offset)
    if new_offset <= current_offset:
        break
    current_offset = new_offset
Full working code, with some other changes in the XPaths.
It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv

driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'

with open("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

    for page_nb in range(1, 4):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # jump to the end of page
        #driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # scroll partially
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)
        results = []

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            title = product.xpath('.//h1/text()')
            #print('[DEBUG] title:', title)
            if title:
                title = title[0]
            #print('[DEBUG] title:', title)

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            # for `$ 35.00`
            currency = price[0]
            price = ''.join(price[1:])
            # for `35.00 zł`
            #currency = price[-1]
            #price = ''.join(price[:-1])
            #print('[DEBUG] price:', price)
            #print('[DEBUG] currency:', currency)

            review = product.xpath('.//span[@class="eXPaM"]/text()')
            if review:
                review = review[0]
            else:
                review = ''
            #print('[DEBUG] review:', review)

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = ''
            #print('[DEBUG] nb_sold:', nb_sold)

            row = [title, price, currency, review, nb_sold]
            results.append(row)
            #print('[DEBUG] row:', row)

        print('len(results):', len(results))
        wr.writerows(results)

driver.close()

How do you scrape several websites and have one output on the same line with Python using Selenium?

I wanted to learn Python, so I started with learning web scraping. I know my code is all over the place, but as I learn more I will clean it up and make it more efficient. However, I want to know how to print from several different websites and get the output on the same line.
Example:
output : output : output : output : output
Here is my redacted code:
from selenium import webdriver

def bond(x):
    driver = webdriver.Chrome()
    url = 'website'
    driver.get(url)
    year_10_bond = driver.find_elements_by_xpath('element')[0].text
    print(year_10_bond)
    driver.close()

b = bond(print)

def stocks(s):
    driver = webdriver.Chrome()
    for i in range(0, 7661):
        page_num = ('&r=' + str(i * 20 + 1))
        url = 'website'
        driver.get(url)
        tickers = driver.find_elements_by_class_name('element')
        company = driver.find_elements_by_xpath('element')
        price = driver.find_elements_by_xpath('element')
        num_of_tickers = len(tickers)
        for i in range(num_of_tickers):
            print(tickers[i].text + " : " + company[i].text + " : " + price[i].text)

s = stocks(print)

def outstanding(o):
    driver = webdriver.Chrome()
    for i in range(0, 7661):
        page_num = ('&r=' + str(i * 20 + 1))
        url = ('element')
        driver.get(url)
        shares_outstanding = driver.find_elements_by_xpath('element')
        num_of_tickers = len(shares_outstanding)
        for i in range(num_of_tickers):
            print(shares_outstanding[i].text)

o = outstanding(print)
driver.close()
Here is the solution that worked for me; I was just putting it in the wrong place at first:
end = ' '
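For context, a minimal sketch of how end=' ' keeps the output of several print() calls on one line (the values are placeholders, since the real URLs and selectors are redacted above):

print('value_from_site_1', end=' : ')   # no newline, so the next print continues the same line
print('value_from_site_2', end=' : ')
print('value_from_site_3')              # default end='\n' finishes the line
# output: value_from_site_1 : value_from_site_2 : value_from_site_3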

How to piece my functions together, making sure the driver variable can be used by all 3

I have 3 functions: login, contacts_object and object_menu. What is the best way to piece them together so I can run them? I am having problems passing the driver variable to the contacts_object and object_menu functions, as it is declared in login.
It is 3 nested for loops. Once you get to the end of the 3rd for loop the driver should close, and we start again with the next login from the login function.
I have tried placing driver in a separate .py file and importing it; however, this is not working.
def login(Test_Account, Password):
    # launch url
    url = "XXXXXXXX"
    for x, y in zip(Test_Account, Password):
        driver = webdriver.Chrome()
        driver.implicitly_wait(1)
        driver.get(url)
        username = driver.find_element_by_id("FORMLOGINid")
        password = driver.find_element_by_id("FORMLOGINpwd")
        username.send_keys(x)
        password.send_keys(y)
        python_button = driver.find_element_by_id('btSubmit')
        python_button.click()  # click
        driver.implicitly_wait(1)


def contacts_object(Open_Contacts):
    for b in Open_Contacts:
        ##########################################################
        python_menu_dropdown = driver.find_element_by_id('id_arrow_popup_menu')
        python_menu_dropdown.click()
        time.sleep(1)
        ############################################################################
        python_menu_open = driver.find_element_by_id('id_popup_OPEN')
        python_menu_open.click()
        time.sleep(1)
        x = driver.find_element_by_xpath(b)
        x.click()
        time.sleep(2)
        search_contact = driver.find_element_by_id('id_searchfield')
        search_contact.clear()
        time.sleep(1)
        search_contact = driver.find_element_by_id('id_searchfield')
        search_contact.send_keys("Test, Acco")
        time.sleep(1)
        search_contact.send_keys(Keys.DOWN + Keys.ENTER)
        time.sleep(5)


def object_menu(Profile, folders):
    for z in folders:
        drop_down = driver.find_element_by_id('folder_panels_arrow')
        drop_down.click()
        if z == len(folders):
            break
        # print(z)
        try:
            python_menu_open = driver.find_element_by_xpath(z)
            driver.implicitly_wait(10)
            if EC.element_to_be_clickable(z):
                python_menu_open.click()
                driver.implicitly_wait(20)
                time.sleep(20)
                page = driver.execute_script("return document.body.innerHTML").encode('utf-8')  # returns the inner HTML as a string
                soup = BeautifulSoup(page, 'html.parser')
                soup = soup.prettify()
                t = time.localtime()
                timestamp = time.strftime('%b-%d-%Y_%H%M', t)
                filepath = os.path.join("XXXXXXXXXXXXX")
                time.sleep(5)
                h = open(filepath, "w")
                h.write(soup)
                h.close()
        except NoSuchElementException:
            f = open(r'XXXXXXXXXXXXX')
            f.write(str(Profile) + " not there" + '\n')
            f.close()
    driver.close()
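One common pattern, offered only as a sketch (the ids and variable names below come from the question; the generator structure is an assumption about how the three loops should nest): create the driver once per account inside login, hand it to the caller, and pass it explicitly into the other two functions so none of them relies on a global.

from selenium import webdriver

def login(test_accounts, passwords, url):
    # log in one account at a time and hand the live driver to the caller
    for user, pwd in zip(test_accounts, passwords):
        driver = webdriver.Chrome()
        driver.implicitly_wait(1)
        driver.get(url)
        driver.find_element_by_id("FORMLOGINid").send_keys(user)
        driver.find_element_by_id("FORMLOGINpwd").send_keys(pwd)
        driver.find_element_by_id('btSubmit').click()
        yield driver

def contacts_object(driver, open_contacts):
    for b in open_contacts:
        ...  # same body as before, but using the driver that was passed in

def object_menu(driver, profile, folders):
    for z in folders:
        ...  # same body as before

# Test_Account, Password, Open_Contacts, Profile and folders are the same lists as in the question.
for driver in login(Test_Account, Password, "XXXXXXXX"):
    contacts_object(driver, Open_Contacts)
    object_menu(driver, Profile, folders)
    driver.quit()  # close this session before the next account logs in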

Incorrect format (wrong number of columns) after writing to CSV files

My intention is to get the full review from every profile, along with the title of the review, the user name, the user's location and the time of posting, from the Reliance Jio review pages of the website, and store it all in a CSV file.
The website I want to crawl is http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061
When I tried storing the scraped data from the first two pages in a CSV file, I got the output below. My problem is that each line generates more columns than desired; one sentence is parsed into many cells.
My code:
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv

firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
driver = webdriver.Firefox(capabilities=firefox_capabilities)

url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")

for items1 in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link1 = items1.find_element_by_css_selector(".reviewdata a")
    link1.click()
    time.sleep(2)

csv = open('index.csv', 'w')
column = "Name,Location,Review_data,Review_title,Review_data\n"
csv.write(column)

soup1 = BeautifulSoup(driver.page_source, "lxml")
for item1 in soup1.select(".review-article"):
    name1 = item1.select("p a")[0].text
    location1 = item1.select("p")[1].text
    review_date1 = item1.select("small")[0].text
    review_title1 = item1.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data1 = ' '.join([' '.join(items1.text.split()) for items1 in item1.select(".reviewdata")])
    print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name1, location1, review_date1, review_title1, review_data1))

    csv1 = open('index.csv', 'a')
    page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
    csv1.write(page1_data)

uclient = uReq(url)
page_html = uclient.read()
uclient.close()

page_soup = soup(page_html, "html.parser")
container = soup.find("ul", {"class": "pages table"})
all_li = container.findAll("li")

last_div = None
for last_div in all_li:
    pass
if last_div:
    content = last_div.getText()
    content1 = int(content)

container1 = soup.findAll("li", {"class": "next"})
li = container1[0].find("a", {"class": "btn btn-link"}).attrs['href']

driver.get(li)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")

for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)

soup = BeautifulSoup(driver.page_source, "lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])
    print("Name: {}\nLocation : {}\nReview_date: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, location, review_date, review_title, review_data))

    csv2 = open("index.csv", 'a')
    page2_data = name + "," + location + "," + review_date + "," + review_title + "," + review_data + "\n"
    csv2.write(page2_data)

driver.quit()
I need help to figure out the error in my code for storing the scraped data into the CSV file in a structured manner.
View your csv file in a text editor. The problem is your spreadsheet program is parsing on BOTH commas and spaces.
Another problem is that you haven't accounted for commas WITHIN your scraped data. That is why you have the city and country in different cells. You will need to put quotation marks around values that have commas within them.
See
page1_data = name1 + "," + location1 + "," + review_date1 + "," + review_title1 + "," + review_data1 + "\n"
csv1.write(page1_data)
There is already a comma used in, say, the location: Delhi, India. If you keep using commas like you did above, the CSV file cannot be parsed properly.
One workaround is to put double quotes around any text containing a comma, so Delhi, India would be turned into "Delhi, India" after this step.
def preprocess(text):
    if "," in text:
        return '"' + text + '"'
    return text
Wrap each of your text fields with the function.
page1_data = preprocess(name1) + "," + preprocess(location1) + "," + preprocess(review_date1) + "," + preprocess(review_title1) + "," + preprocess(review_data1) + "\n"
This should work.
Another way would be to change the delimiter to a different character.
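For example, a sketch of writing with a different delimiter (the spreadsheet reading the file must then be told to split on the same character; the values here are made up for illustration):

import csv

with open('index.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')   # use ';' instead of ',' as the column separator
    writer.writerow(['Name', 'Location', 'Review_data', 'Review_title', 'Review_data'])
    writer.writerow(['Some User', 'Delhi, India', '2 days ago', 'Great network', 'Review text, with commas'])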
You should use the csv module because it will automatically resolve the problems with commas and "new line"/Enter characters in the text.
Create a csv writer
f = open('index.csv','w')
csv_writer = csv.writer(f)
and write the headers using a list, not a single string
column = ["Name", "Location", "Review_data", "Review_title", "Review_data"]
csv_writer.writerow(column)
and write the list with the data the same way
row = [name, location, review_date, review_title, review_data]
csv_writer.writerow(row)
Full code
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv

# --- init ---

firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'

driver = webdriver.Firefox(capabilities=firefox_capabilities)

url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"

# --- open file ---

f = open("index.csv", "w")
csv_writer = csv.writer(f)

columns = ["Name", "Location", "Review_data", "Review_title", "Review_data"]
csv_writer.writerow(columns)

# ---- get data ---

driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")

for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)

soup = BeautifulSoup(driver.page_source, "lxml")

for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])

    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)

    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)

# --- get next url ---

uclient = uReq(url)
page_html = uclient.read()
uclient.close()

soup = BeautifulSoup(page_html, "html.parser")
container = soup.find("ul", {"class": "pages table"})
all_li = container.findAll("li")

if all_li:
    last_div = all_li[-1]
    content = last_div.getText()
    content = int(content)

container = soup.findAll("li", {"class": "next"})
li = container[0].find("a", {"class": "btn btn-link"}).attrs['href']

# ---- get data ---

driver.get(li)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, "lxml")

for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article"))):
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)

soup = BeautifulSoup(driver.page_source, "lxml")

for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    location = item.select("p")[1].text
    review_date = item.select("small")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    review_data = ' '.join([' '.join(items.text.split()) for items in item.select(".reviewdata")])

    print("Name:", name)
    print("Location:", location)
    print("Review_date:", review_date)
    print("Review_Title:", review_title)
    print("Review_Data:", review_data)

    row = [name, location, review_date, review_title, review_data]
    csv_writer.writerow(row)

# --- end ---

driver.quit()
f.close()
EDIT: version without beautifulsoup and requests - only selenium
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import csv


def get_data(driver, csv_writer):
    for item in driver.find_elements_by_css_selector(".review-article"):
        name = item.find_elements_by_css_selector("p a")[0].text
        location = item.find_elements_by_css_selector("p")[1].text
        review_date = item.find_elements_by_css_selector("small")[0].text
        review_title = item.find_elements_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
        review_data = item.find_elements_by_css_selector(".reviewdata")
        review_data = ' '.join([' '.join(items.text.split()) for items in review_data])

        print("Name:", name)
        print("Location:", location)
        print("Review_date:", review_date)
        print("Review_Title:", review_title)
        print("Review_Data:", review_data)

        row = [name, location, review_date, review_title, review_data]
        csv_writer.writerow(row)

# --- init ---

firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'

driver = webdriver.Firefox(capabilities=firefox_capabilities)

url = "http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061"

# --- open file ---

f = open("index.csv", "w")
csv_writer = csv.writer(f)

columns = ["Name", "Location", "Review_data", "Review_title", "Review_data"]
csv_writer.writerow(columns)

# ---- get data ---

print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)

get_data(driver, csv_writer)

# --- get next url ---

url = driver.find_element_by_xpath('//li[@class="next"]/a').get_attribute("href")

# ---- get data ---

print('url:', url)
driver.get(url)
wait = WebDriverWait(driver, 10)

get_data(driver, csv_writer)

# --- end ---

driver.quit()
f.close()
