I need to scrape a website using Selenium. Following is the code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
import logging
import signal
#make lists for all the different aspects needed.
links = []
pics = []
types = []
names = []
descs = []
views = []
no_speakers = []
location = []
dates = []
people = []
organization = []
summ = []
twitter = []
facebook = []
contact = []
emails = []
website_link = []
venue = []
official_address = []
speakers = []
fees = []
at_tr = []
prev_links = []
index = -1
update = []
def main_url(url):
driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')#gets the web driver.
driver.get(url) #gets the URL
time.sleep(5) # wait 5 seconds for the DOM to load completely
while True:
try:
driver.find_element_by_id('view_more').click() #clicks on load more until there are no more events to be loaded.
time.sleep(3)
except Exception as e:
break
rows = driver.find_elements_by_class_name('sec_conf_main')
for row in rows:
conf = row.find_element_by_class_name('conf_summery')
nam = conf.find_element_by_class_name('c_name')
name = nam.find_element_by_tag_name('a')
if len(names) != 0 and name.get_attribute('title') in names:
index = names.index(name.get_attribute('title'))
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
if links[index] == link:
pass
else:
links[index] = link.get_attribute('href') #get link of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
img = link.find_element_by_tag_name('img')
if pics[index] == img.get_attribute('src'):
pass
else:
pics[index] = img.get_attribute('src') #picture source of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
desc = row.find_element_by_class_name('conf_desc')
if descs[index] == desc.text:
pass
else:
descs[index] = desc.text #description of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
if views[index] == view:
pass
else:
views[index] = view #number of views.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
if no_speakers[index] == d[count + 1].text:
pass
else:
no_speakers[index] = d[count + 1].text #number of speakers.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
if types[index] == ','.join(ty):
pass
else:
types[index] = (','.join(ty))#speciality of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
if location[index] == item.text:
pass
else:
location[index] = (item.text) #location of event
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
i = date.find('|')
if dates[index] == date[:i]:
pass
else:
dates[index] = (date[:i]) #date from and to of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
else:
names.append(name.get_attribute('title')) #title of event.
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
links.append(link.get_attribute('href')) #get link of event.
img = link.find_element_by_tag_name('img')
pics.append(img.get_attribute('src')) #picture source of event.
desc = row.find_element_by_class_name('conf_desc')
descs.append(desc.text) #description of event.
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
views.append(view) #number of views.
no_speakers.append(d[count + 1].text) #number of speakers.
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
types.append(','.join(ty))#speciality of event.
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
location.append(item.text) #location of event
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
index = date.find('|')
dates.append(date[:index]) #date from and to of event.
except NoSuchElementException as e:
pass
driver.close()
driver.quit()
def each_event(item):
driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
driver.get(item) #get each Link of the event.
time.sleep(5)
if len(prev_links) != 0 and item in prev_links:
index = links.index(item)
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
if organization[index] == ' '.join(l[3:]):
pass
else:
organization[index] = (' '.join(l[3:]))
if not item in update:
update.append(item)
except NoSuchElementException as e:
organization[index] = 'No Organization Given.'
try:
summary = driver.find_element_by_class_name('conf_head_summary')
if summ[index] == summary.find_element_by_tag_name('p').text:
pass
else:
summ[index] = (summary.find_element_by_tag_name('p').text)
if not item in update:
update.append(item)
except NoSuchElementException as e:
summ[index] = 'No Conference Summary Given.'
try:
tw = driver.find_element_by_class_name('TW')
if twitter[index] == tw.get_attribute('title'):
pass
else:
twitter[index] = (tw.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
twitter[index] = 'No Twitter Link'
try:
fb = driver.find_element_by_class_name('FB')
if facebook[index] == fb.get_attribute('title'):
pass
else:
facebook[index] = (fb.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
facebook[index] = ('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if contact[index] == c:
pass
else:
if len(c) == 0:
contact[index] = ('No Contact Number Given.')
else:
contact[index] = (c)
if not item in update:
update.append(item)
except NoSuchElementException as e:
contact[index] = ('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
if emails[index] == ','.join(e):
pass
else:
emails[index] = (','.join(e))
if not item in update:
update.append(item)
except NoSuchElementException as e:
emails[index] = ('No email.')
try:
web = driver.find_element_by_id('cRegistraionpopup5').get_attribute('href')
if website_link[index] == web:
pass
else:
website_link[index] = (web)
if not item in update:
update.append(item)
except NoSuchElementException as e:
website_link[index] = ('No Website Link')
try:
v = driver.find_element_by_class_name('conf_venue1').text
if venue[index] == v:
pass
else:
venue[index] = (v)
if not item in update:
update.append(item)
except NoSuchElementException as e:
venue[index] = ('No Venue Given.')
try:
oa = driver.find_element_by_class_name('hotel-detail').text
if official_address[index] == oa:
pass
else:
official_address[index] = oa
if not item in update:
update.append(item)
except NoSuchElementException as e:
official_address[index] = ('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers[index] = 'No Speakers'
if speakers[index] == ','.join(l):
pass
else:
speakers[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
speakers[index] = ('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
if fees[index] == ','.join(l):
pass
else:
fees[index] = (';'.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
fees[index] = ('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr[index] = 'No Attenders or Trackers Given.'
if at_tr[index] == ','.join(l):
pass
else:
at_tr[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
at_tr[index] = ('No Attenders or Trackers Given')
else:
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
organization.append(' '.join(l[3:]))
except NoSuchElementException as e:
organization.append('No Organization Given.')
try:
summary = driver.find_element_by_class_name('conf_head_summary')
summ.append(summary.find_element_by_tag_name('p').text)
except NoSuchElementException as e:
summ.append('No Conference Summary Given.')
try:
tw = driver.find_element_by_class_name('TW')
twitter.append(tw.get_attribute('title'))
except:
twitter.append('No Twitter Link')
try:
fb = driver.find_element_by_class_name('FB')
facebook.append(fb.get_attribute('title'))
except:
facebook.append('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if len(c) == 0:
contact.append('No Contact Number Given.')
else:
contact.append(c)
except NoSuchElementException as e:
contact.append('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
emails.append(' '.join(e))
except NoSuchElementException as e:
emails.append('No email.')
try:
website_link.append(driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'))
except NoSuchElementException as e:
website_link.append('No Website Link')
try:
venue.append(driver.find_element_by_class_name('conf_venue1').text)
except NoSuchElementException as e:
venue.append('No Venue Given.')
try:
official_address.append(driver.find_element_by_class_name('hotel-detail').text)
except NoSuchElementException as e:
official_address.append('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers.append('No Speakers Given.')
else:
speakers.append(','.join(l))
except NoSuchElementException as e:
speakers.append('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
fees.append(';'.join(l))
except NoSuchElementException as e:
fees.append('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr.append('No Attenders or Trackers Given')
else:
at_tr.append(','.join(l))
except NoSuchElementException as e:
at_tr.append('No Attenders or Trackers Given')
driver.close()
driver.quit()
def main():
file = 'EMedEvents.xlsx' #file to write in
book = open_workbook(file)
sheet = book.sheet_by_index(0)
d = pd.read_excel(file)
if d.empty:
pass
else:
for row in range(1, sheet.nrows):
names.append(sheet.cell(row, 0).value)
dates.append(sheet.cell(row, 1).value)
types.append(sheet.cell(row, 2).value)
location.append(sheet.cell(row, 3).value)
descs.append(sheet.cell(row, 4).value)
views.append(sheet.cell(row, 5).value)
no_speakers.append(sheet.cell(row, 6).value)
pics.append(sheet.cell(row, 7).value)
links.append(sheet.cell(row, 8).value)
organization.append(sheet.cell(row, 9).value)
summ.append(sheet.cell(row, 10).value)
twitter.append(sheet.cell(row, 11).value)
facebook.append(sheet.cell(row, 12).value)
contact.append(sheet.cell(row, 13).value)
emails.append(sheet.cell(row, 14).value)
website_link.append(sheet.cell(row, 15).value)
venue.append(sheet.cell(row, 16).value)
official_address.append(sheet.cell(row, 17).value)
speakers.append(sheet.cell(row, 18).value)
fees.append(sheet.cell(row, 19).value)
at_tr.append(sheet.cell(row, 20).value)
if len(links) != 0:
for item in links:
prev_links.append(item)
main_url("https://www.emedevents.com/india-medical-conferences") #main url to use.
for item in links:
each_event(item) #get people information of each event.
df = pd.DataFrame.from_dict({'Event Name':names,'Event Dates':dates, 'Specialty' : types,'Event Location' : location, 'Description' : descs,
'Views' : views, 'Speakers' : no_speakers, 'Picture Source' : pics, 'Event Link' : links, 'Organized By' : organization,
'Conference Summary' : summ, 'Twitter Link' : twitter, 'Facebook Link' : facebook,'Contact Number' : contact,
'Email' : emails, 'Website Link' : website_link, 'Venue' : venue, 'Official Address' : official_address, 'Speaking' : speakers,
'Fees' : fees, 'Attenders and Trackers': at_tr})
df.to_excel(file, header=True, index=False) #print the data in the excel sheet.
logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
logging.info('%d events were read from the excel sheet', len(prev_links))
logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
logging.info('Following are the links of the events that were updated:')
for item in update:
logging.info(item)
if __name__ == '__main__':
main() #if the name is main, run the main method and continue with the program.
The program works on a Windows system, with chromedriver.exe downloaded and its path provided in the code. I want to make this work on an Ubuntu Linux system, which does not use .exe files.
I know I can use a headless driver to make this work.
I tried making the following change:
driver = webdriver.PhantomJS()
The change does not work properly. It gives me the following warning:
UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead
warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '
I do not understand what to do now. What changes do I need to make in my code so that it works on Ubuntu?
Thanks in Advance.
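(For reference, a minimal sketch of what the deprecation warning suggests: headless Chrome on Ubuntu. The chromedriver path below is an assumption; adjust it, or drop that argument entirely if chromedriver is already on PATH.)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')            # run Chrome without a visible window
options.add_argument('--no-sandbox')          # often needed inside containers/CI
options.add_argument('--disable-dev-shm-usage')

# '/usr/local/bin/chromedriver' is an assumed install location on Ubuntu;
# newer Selenium versions prefer options= (and a Service object) over chrome_options=.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=options)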
Related
I have the following problem with my code. I'm trying to scrape information from the following link:
sepomex
I'm trying to get the Estado, Municipio, and a list of CP (código postal, by the way), but I'm not getting the expected output.
and this is my code:
import os
import datetime
import requests
import lxml.html as html

HOME_URL = 'https://www.correosdemexico.gob.mx/SSLServicios/ConsultaCP/Descarga.aspx'

XPATH_ESTADOS = '//select[@id="DdlEstado"]/option[@value > 0]/text()'
XPATH_MUNICIPIOS = '//select[@id="DdlMunicipio"]/option[@value > 0]/text()'
XPATH_LOCALIDAD = '//table[@class="small"]/tbody/tr[@class="dgNormal"]/td/text()'

def parse_edos(estado, today):
    try:
        response = requests.get(estado)
        if response.status_code == 200:
            root = html.fromstring(response.content)
            try:
                municipios = root.xpath(XPATH_MUNICIPIOS)
                for municipio in municipios:
                    localidad = root.xpath(XPATH_LOCALIDAD)
            except IndexError:
                return
            with open(f'{today}/{estado}.txt', 'w', encoding='utf-8') as f:
                for i in localidad:
                    f.write(i + '\n')
        else:
            raise ValueError(f'Error: {response.status_code}')
    except ValueError as err:
        print(err)

def main():
    try:
        response = requests.get(HOME_URL)
        if response.status_code == 200:
            root = html.fromstring(response.content)
            estados = root.xpath(XPATH_ESTADOS)
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            if not os.path.isdir(today):
                os.mkdir(today)
            for estado in estados:
                parse_edos(estado, today)
        else:
            raise ValueError(f'Error: {response.status_code}')
    except ValueError as err:
        print(err)

def run():
    main()

if __name__ == '__main__':
    run()
Sorry about my English. :P
I am trying to scrape the following link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using Python/requests_html.
My problem is that it gets auto-redirected to the default server tab instead of the mp4upload tab; I'm trying to find a fix for this but can't figure it out.
Below is the code:
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
def get_ep_links(url):
html = nine_scraper.get_html(url, True)
servers = html.find('div', id='servers-container')
if servers:
results = []
mp4upload_results = []
mp4upload = servers.find('div', attrs={'data-id': '35'})
mp4upload_eps = mp4upload.find_all('a', href=True)
for ep in mp4upload_eps:
x = (ep.get('href'), ep.text)
mp4upload_results.append(x)
for result in mp4upload_results:
results.append(base_url + result[0])
return results
else:
print('No servers found!!')
def get_series_info(url):
return
def get_servers(html):
return
def find_download(url):
html = nine_scraper.get_html(url, True)
def search(query):
if '&page=' in query:
query = query.split('&page=')
search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
else:
search_url = base_url + '/search?keyword=' + parse.quote(query)
html = nine_scraper.get_html(search_url, False)
film_list = html.find('div', class_='film-list')
if film_list:
results = []
prev_page = html.find('a', class_='pull-left')
next_page = html.find('a', class_='pull-right')
films = film_list.find_all('div', class_='inner')
for film in films:
results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
if prev_page.get('href'):
param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Previous page', url))
if next_page.get('href'):
param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
results.append(('Next page', url))
return results
else:
print('No results found!')
def get_html(url, render_js=False): # Load webpage and return its html
try:
if render_js: # Check if page needs to render javascript, if so use 'requests_html'
session = HTMLSession() # Make a GET request to your webpage, using 'Requests'
resp = session.get(url, timeout=10)
resp.raise_for_status() # Raise an exception if the response doesn't come back 200-400
resp.html.render(timeout=10) # Render the javascript
html = BeautifulSoup(resp.html.html, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
else: # Use 'cloudscraper' since we dont need to load any javascript
c_scraper = cloudscraper.create_scraper() # Make a GET request to your webpage, using 'Requests'
resp = c_scraper.get(url)
resp.raise_for_status() # Raise an exception if the response doesn't come back 200-400
html = BeautifulSoup(resp.content, 'html.parser') # Parse the html data we just got with 'BeautifulSoup4'
return html # Return the parsed html
except requests.HTTPError as e:
print(f'HTTP error occurred: {e}')
except requests.ConnectionError as e:
print(f'Connection Error occurred: {e}')
except requests.Timeout as e:
print(f'Timeout Error occurred: {e}')
except requests.RequestException as e:
print(f'General Error occurred: {e}')
except Exception as e:
print(f'Other error occurred: {e}')
except KeyboardInterrupt:
print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
# for mac and linux(os.name is 'posix')
if name == 'nt':
_ = system('cls')
else:
_ = system('clear')
def main_menu():
while True:
screen_clear()
print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
main_choice = input('Enter your choice [1-3] >')
if main_choice == '1':
search_menu()
break
elif main_choice == '2':
continue
elif main_choice == '3':
screen_clear()
sys.exit()
else:
continue
def search_menu(query=False):
screen_clear()
print('--------------9anime downloader/search--------------\n')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
else:
query = input('Please enter the name of the anime >')
if query:
search_results = nine_scraper.search(query)
results_menu(search_results)
def results_menu(results):
for num, result in enumerate(results, 1):
title = result[0]
link = result[1]
if 'Previous page' not in title:
if 'Next page' in title:
n = True
print('[N] ' + title)
else:
print(f'[{num}] {title}')
else:
p = True
print('[P] ' + title)
print('[M] Main menu')
titles, links = map(list, zip(*results))
while True:
search_choice = input('Enter choice >')
try:
search_choice = int(search_choice)
if 1 <= search_choice <= len(results) + 1:
print(links[search_choice - 1])
print(titles[search_choice - 1])
ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
for link in ep_links:
print(link)
nine_scraper.find_download(link)
# series_menu(links[search_choice - 1])
break
except ValueError:
if search_choice.lower() == 'm':
main_menu()
break
elif search_choice.lower() == 'p':
if p:
url = links[-2]
search_menu(url)
break
continue
elif search_choice.lower() == 'n':
if n:
url = links.pop()
search_menu(url)
break
continue
def series_menu(url):
info = nine_scraper.get_series_info()
main_menu()
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do to stop that. Any help would be much appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url, allow_redirects=False)
Now your request should go only to the requested URL.
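As a rough sketch (url here is just a placeholder for the episode link), you can also inspect the blocked redirect to see where the server wanted to send you:
from requests_html import HTMLSession

url = 'https://9anime.to/watch/one-piece-dub.34r/r2wjlq'  # placeholder
session = HTMLSession()
r = session.get(url, allow_redirects=False)

if r.is_redirect:
    # The server answered with a 3xx status; the target is in the Location
    # header, but the request itself stays on the URL you asked for.
    print('Redirect blocked, server pointed to:', r.headers.get('Location'))
else:
    print('No HTTP-level redirect; the page itself may redirect via JavaScript.')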
1) I have a list of product links, and it contains 3385 links.
2) I have a function get_pro_info(link) that takes a product link and appends the item to a JSON file.
3) I want Selenium to open 5 browsers and 5 links in parallel, get the product information, and append it to a file or list,
or, alternatively, 3) Selenium opens 1 browser with 5 tabs (holding 5 links) and appends to the file.
Question: how can I apply threading to my code?
My code:
new_url=''
def get_pro_info(pro_url):
driver = webdriver.Chrome(executable_path=r'C:\Users\Beenu\PycharmProjects/chromedriver.exe')
try:
new_url = 'https://pk.studiobytcs.com' + pro_url
print('new product URL: ' + new_url)
driver.execute_script("window.open('');")
sleep(1)
# use to switch control
driver.switch_to.window(driver.window_handles[0])
# sleep(1)
driver.get(new_url)
except(WebDriverException, selenium.common.exceptions.TimeoutException, Exception) as e:
print('There is error in getting Product by URL in get_pro_info()! \n' + str(e.stacktrace))
pass
description_source_code = ''
# description_soup = BeautifulSoup()
description_soup: BeautifulSoup = object
# global description_soup
try:
# description_soup = BeautifulSoup('html.parser')
description: WebElement = driver.find_element_by_xpath(
'//*[@id="shopify-section-product-template"]/div[2]/div[1]/div/div[2]')
description_source_code = description.get_attribute("innerHTML")
description_soup: BeautifulSoup = BeautifulSoup(description_source_code, 'html.parser')
except NoSuchElementException as e:
print('Product description taag not found! \n' + str(e.stacktrace))
pass
# 179 here
# This is for getting heading product name
head = ''
r_j_title = ''
try:
head = description_soup.find_all("h1", class_="product_name")
# print(head)
r_j_title = head[0].string.strip()
print("Title: " + r_j_title)
except (HTMLParser, IndexError):
print('Fail to get heading/title Tag! \n' + str(HTMLParser))
# This is for get brand name from heading/title
r_j_brand_and_designer = ''
try:
brand_and_designer = head[0].string.strip().split("-")[0]
r_j_brand_and_designer = str(brand_and_designer).strip()
print('Brand and designer: ' + r_j_brand_and_designer)
except (IndexError, ValueError) as e:
print('Fail to Split Brand from heading/title ! \n' + str(e.stacktrace))
# This is for getting price in integer
r_j_price_in_int = ''
try:
price = description_soup.find_all("span", class_="money")
# print(price)
price_new = price[0].string.strip()
print("New price: " + price_new)
# this is for getting price from string
r_c_price = price[0].string.strip().split(".")[1]
r_j_price_in_int = str(r_c_price).replace(",", "")
# price could ha ,
print('Price: ' + r_j_price_in_int)
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get Tag or failed to Split Brand from heading/title ! \n' + str(e.stacktrace))
# this is for getting full description
description_all = ''
r_j_desc = ''
try:
description_all = description_soup.find_all("div", class_="description")
final_des = str(description_all[0].get_text())
ch = final_des.split()
r_j_desc = str(' '.join(ch))
print("with split ch : " + r_j_desc) # addtion of .string.strip()
except (HTMLParser, IndexError, ValueError) as e:
print('Fail to get all description Tag or failed to Split and removing endline chr from description ! \n' + str(
e.stacktrace))
# This is for trying if fibric tag is not avaliable
try:
get_split_fibric = description_all[0].get_text().split("Fabric", 1)[1]
get_split_des = get_split_fibric.split("Disclaimer")[0]
r_j_fabric = str(get_split_des).strip()
print("getting fibric: " + r_j_fabric)
except IndexError as e:
r_j_fabric = 'N/A'
print('Fabric is not avaliable: ' + r_j_fabric)
item['brand_name'] = str(r_j_brand_and_designer)
item['designer'] = str(r_j_brand_and_designer)
item['title'] = str(r_j_title)
item['description'] = str(r_j_desc)
item['price'] = int(r_j_price_in_int)
item['currency'] = "PKR"
item['product_id'] = str(r_j_title)
item['source'] = str(new_url)
item['fabric'] = str(r_j_fabric)
item['gender'] = "woman"
print(item)
cloth = {
"cloth": item
}
# instruction
print(cloth)
list_before_dump.append(cloth)
driver.close()
driver.quit()
with open('product_link_read.txt', 'r') as file:
data = file.readlines()
# rd_pro_link_list=rd_pro_link_list+data.replace('\n', '')
print(data)
for line in data:
# fap=
rd_pro_link_list.append(str(line).strip())
print(rd_pro_link_list)
print(len(rd_pro_link_list))
for pro_link in rd_pro_link_list:
get_pro_info(pro_link)
print('Pro count = ' + str(pro_count))
pro_count = pro_count + 1
list_before_dump_file.write(json.dumps(list_before_dump))
driver.close()
list_before_dump_file.close()
If you want to iterate over the list and always take 20 links at a time, you can use range(start, stop, step) with step=20:
all_t = []

for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)
    # --- inside first `for` loop ---
    for t in all_t:
        t.join()
The other method is fine if you won't need the list later:
all_t = []

while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)
    # --- inside first `for` loop ---
    for t in all_t:
        t.join()
BTW: args= needs a tuple - even if you have only one argument - so you need a , inside the ( ) to create a one-element tuple.
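For example (the second call's extra parameters are only illustrative):
# single argument: the trailing comma makes it a tuple
t = threading.Thread(target=get_product_info, args=(link,))

# several arguments: just a normal tuple
t = threading.Thread(target=get_product_info, args=(link, output_file, timeout))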
BTW: If you want only 20 threads running at any given moment, then it is better to look at multiprocessing and Pool(20):
from multiprocessing import Pool

def get_product_info(link):
    result = ....
    return result

if __name__ == '__main__':
    with Pool(20) as p:
        all_results = p.map(get_product_info, list_of_product_link)
I have made a scraper which scrapes data from a website. My code currently writes to an Excel file, and it also reads and updates that file. It first reads the existing Excel data so it can update the information already in the sheet, and if the website has new information that is not yet in the Excel database, that is added to the sheet.
Following is the code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
from selenium.webdriver.chrome.options import Options
import logging
#make lists for all the different aspects needed.
links = []
pics = []
types = []
names = []
descs = []
views = []
no_speakers = []
location = []
dates = []
people = []
organization = []
summ = []
twitter = []
facebook = []
contact = []
emails = []
website_link = []
venue = []
official_address = []
speakers = []
fees = []
at_tr = []
prev_links = []
index = -1
update = []
def main_url(url):
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
driver.get(url) #gets the URL
time.sleep(5) # wait 5 seconds for the DOM to load completely
while True:
try:
driver.find_element_by_id('view_more').click() #clicks on load more until there are no more events to be loaded.
time.sleep(3)
except Exception as e:
break
rows = driver.find_elements_by_class_name('sec_conf_main')
for row in rows:
conf = row.find_element_by_class_name('conf_summery')
nam = conf.find_element_by_class_name('c_name')
name = nam.find_element_by_tag_name('a')
if len(names) != 0 and name.get_attribute('title') in names:
index = names.index(name.get_attribute('title'))
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
if links[index] == link:
pass
else:
links[index] = link.get_attribute('href') #get link of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
img = link.find_element_by_tag_name('img')
if pics[index] == img.get_attribute('src'):
pass
else:
pics[index] = img.get_attribute('src') #picture source of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
desc = row.find_element_by_class_name('conf_desc')
if descs[index] == desc.text:
pass
else:
descs[index] = desc.text #description of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
if views[index] == view:
pass
else:
views[index] = view #number of views.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
if no_speakers[index] == d[count + 1].text:
pass
else:
no_speakers[index] = d[count + 1].text #number of speakers.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
if types[index] == ','.join(ty):
pass
else:
types[index] = (','.join(ty))#speciality of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
if location[index] == item.text:
pass
else:
location[index] = (item.text) #location of event
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
i = date.find('|')
if dates[index] == date[:i]:
pass
else:
dates[index] = (date[:i]) #date from and to of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
else:
names.append(name.get_attribute('title')) #title of event.
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
links.append(link.get_attribute('href')) #get link of event.
img = link.find_element_by_tag_name('img')
pics.append(img.get_attribute('src')) #picture source of event.
desc = row.find_element_by_class_name('conf_desc')
descs.append(desc.text) #description of event.
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
views.append(view) #number of views.
no_speakers.append(d[count + 1].text) #number of speakers.
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
types.append(','.join(ty))#speciality of event.
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
location.append(item.text) #location of event
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
index = date.find('|')
dates.append(date[:index]) #date from and to of event.
except NoSuchElementException as e:
pass
driver.close()
driver.quit()
def each_event(item):
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
try:
driver.get(item) #get each Link of the event.
time.sleep(5)
if len(prev_links) != 0 and item in prev_links:
index = links.index(item)
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
if organization[index] == ' '.join(l[3:]):
pass
else:
organization[index] = (' '.join(l[3:]))
if not item in update:
update.append(item)
except NoSuchElementException as e:
organization[index] = 'No Organization Given.'
try:
summary = driver.find_element_by_class_name('conf_head_summary')
if summ[index] == summary.find_element_by_tag_name('p').text:
pass
else:
summ[index] = (summary.find_element_by_tag_name('p').text)
if not item in update:
update.append(item)
except NoSuchElementException as e:
summ[index] = 'No Conference Summary Given.'
try:
tw = driver.find_element_by_class_name('TW')
if twitter[index] == tw.get_attribute('title'):
pass
else:
twitter[index] = (tw.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
twitter[index] = 'No Twitter Link'
try:
fb = driver.find_element_by_class_name('FB')
if facebook[index] == fb.get_attribute('title'):
pass
else:
facebook[index] = (fb.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
facebook[index] = ('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if contact[index] == c:
pass
else:
if len(c) == 0:
contact[index] = ('No Contact Number Given.')
else:
contact[index] = (c)
if not item in update:
update.append(item)
except NoSuchElementException as e:
contact[index] = ('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
if emails[index] == ','.join(e):
pass
else:
emails[index] = (','.join(e))
if not item in update:
update.append(item)
except NoSuchElementException as e:
emails[index] = ('No email.')
try:
web = driver.find_element_by_id('cRegistraionpopup5').get_attribute('href')
if website_link[index] == web:
pass
else:
website_link[index] = (web)
if not item in update:
update.append(item)
except NoSuchElementException as e:
website_link[index] = ('No Website Link')
try:
v = driver.find_element_by_class_name('conf_venue1').text
if venue[index] == v:
pass
else:
venue[index] = (v)
if not item in update:
update.append(item)
except NoSuchElementException as e:
venue[index] = ('No Venue Given.')
try:
oa = driver.find_element_by_class_name('hotel-detail').text
if official_address[index] == oa:
pass
else:
official_address[index] = oa
if not item in update:
update.append(item)
except NoSuchElementException as e:
official_address[index] = ('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers[index] = 'No Speakers'
if speakers[index] == ','.join(l):
pass
else:
speakers[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
speakers[index] = ('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
if fees[index] == ','.join(l):
pass
else:
fees[index] = (';'.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
fees[index] = ('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr[index] = 'No Attenders or Trackers Given.'
if at_tr[index] == ','.join(l):
pass
else:
at_tr[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
at_tr[index] = ('No Attenders or Trackers Given')
else:
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
organization.append(' '.join(l[3:]))
except NoSuchElementException as e:
organization.append('No Organization Given.')
try:
summary = driver.find_element_by_class_name('conf_head_summary')
summ.append(summary.find_element_by_tag_name('p').text)
except NoSuchElementException as e:
summ.append('No Conference Summary Given.')
try:
tw = driver.find_element_by_class_name('TW')
twitter.append(tw.get_attribute('title'))
except:
twitter.append('No Twitter Link')
try:
fb = driver.find_element_by_class_name('FB')
facebook.append(fb.get_attribute('title'))
except:
facebook.append('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if len(c) == 0:
contact.append('No Contact Number Given.')
else:
contact.append(c)
except NoSuchElementException as e:
contact.append('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
emails.append(' '.join(e))
except NoSuchElementException as e:
emails.append('No email.')
try:
website_link.append(driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'))
except NoSuchElementException as e:
website_link.append('No Website Link')
try:
venue.append(driver.find_element_by_class_name('conf_venue1').text)
except NoSuchElementException as e:
venue.append('No Venue Given.')
try:
official_address.append(driver.find_element_by_class_name('hotel-detail').text)
except NoSuchElementException as e:
official_address.append('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers.append('No Speakers Given.')
else:
speakers.append(','.join(l))
except NoSuchElementException as e:
speakers.append('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
fees.append(';'.join(l))
except NoSuchElementException as e:
fees.append('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr.append('No Attenders or Trackers Given')
else:
at_tr.append(','.join(l))
except NoSuchElementException as e:
at_tr.append('No Attenders or Trackers Given')
driver.close()
driver.quit()
except Exception as e:
pass
def main():
file = 'EMedEvents.xlsx' #file to write in
book = open_workbook(file)
sheet = book.sheet_by_index(0)
d = pd.read_excel(file)
if d.empty:
pass
else:
for row in range(1, sheet.nrows):
names.append(sheet.cell(row, 0).value)
dates.append(sheet.cell(row, 1).value)
types.append(sheet.cell(row, 2).value)
location.append(sheet.cell(row, 3).value)
descs.append(sheet.cell(row, 4).value)
views.append(sheet.cell(row, 5).value)
no_speakers.append(sheet.cell(row, 6).value)
pics.append(sheet.cell(row, 7).value)
links.append(sheet.cell(row, 8).value)
organization.append(sheet.cell(row, 9).value)
summ.append(sheet.cell(row, 10).value)
twitter.append(sheet.cell(row, 11).value)
facebook.append(sheet.cell(row, 12).value)
contact.append(sheet.cell(row, 13).value)
emails.append(sheet.cell(row, 14).value)
website_link.append(sheet.cell(row, 15).value)
venue.append(sheet.cell(row, 16).value)
official_address.append(sheet.cell(row, 17).value)
speakers.append(sheet.cell(row, 18).value)
fees.append(sheet.cell(row, 19).value)
at_tr.append(sheet.cell(row, 20).value)
if len(links) != 0:
for item in links:
prev_links.append(item)
main_url("https://www.emedevents.com/india-medical-conferences") #main url to use.
for item in links:
each_event(item) #get people information of each event.
df = pd.DataFrame.from_dict({'Event Name':names,'Event Dates':dates, 'Specialty' : types,'Event Location' : location, 'Description' : descs,
'Views' : views, 'Speakers' : no_speakers, 'Picture Source' : pics, 'Event Link' : links, 'Organized By' : organization,
'Conference Summary' : summ, 'Twitter Link' : twitter, 'Facebook Link' : facebook,'Contact Number' : contact,
'Email' : emails, 'Website Link' : website_link, 'Venue' : venue, 'Official Address' : official_address, 'Speaking' : speakers,
'Fees' : fees, 'Attenders and Trackers': at_tr})
df.to_excel(file, header=True, index=False) #print the data in the excel sheet.
logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
logging.info('%d events were read from the excel sheet', len(prev_links))
logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
logging.info('Following are the links of the events that were updated:')
for item in update:
logging.info(item)
if __name__ == '__main__':
main() #if the name is main, run the main method and continue with the program.
I need all of this functionality implemented with MongoDB rather than Excel. I am totally new to MongoDB, so I don't understand where to start.
Any help would be awesome.
Thanks in Advance.
You should probably use a MongoDB driver for Python, such as PyMongo:
https://github.com/mongodb/mongo-python-driver/blob/master/README.rst
If you don't know anything about MongoDB at all, begin with a basic tutorial, then use the driver to write the values to your database:
https://www.tutorialspoint.com/mongodb/mongodb_tutorial.pdf
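As a rough sketch only (the database and collection names, and the mongod running on localhost:27017, are assumptions for illustration), storing one document per event keyed by its link would replace the read-then-update Excel logic:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
collection = client['emedevents']['events']   # assumed names

# One document per event instead of one row per event.
event = {
    'name': 'Example Conference',
    'link': 'https://www.emedevents.com/example',
    'dates': '...',
    'venue': '...',
}

# upsert=True inserts the document if the link is new and updates it otherwise,
# which plays the role of the "read the sheet, then update or append" logic.
collection.update_one({'link': event['link']}, {'$set': event}, upsert=True)

# Reading everything back (the equivalent of loading the Excel sheet):
for doc in collection.find():
    print(doc['name'], doc.get('venue'))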
I've been trying to scrape Instagram posts for a certain hashtag, pulling the keys display_url, taken_at_timestamp, text, and edge_liked_by. This works perfectly for some hundreds of posts at the start, but then it stops fetching only the 'text' key; the other three fields are still fetched successfully. I am not sure why this is happening.
I am parsing the JSON from https://www.instagram.com/explore/tags/something/?__a=1.
base_url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"

while True:
    response = url_req.urlopen(url)
    json_file = json.load(response)
    for i in json_file['graphql']['hashtag']['edge_hashtag_to_media']['edges']:
        try:
            post_text = i['node']['edge_media_to_caption']['edges'][0]['node']['text']
        except IndexError as e:
            post_text = e
        try:
            display_url = i['node']['display_url']
        except:
            display_url = e
        try:
            like_count = i['node']['edge_liked_by']['count']
        except:
            like_count = e
        try:
            time_stamp = i['node']['taken_at_timestamp']
        except:
            time_stamp = e
        output.append([display_url, like_count, time_stamp, post_text])
    df = pd.DataFrame(output, columns=['URL', 'Like Count', 'Time', 'Text'])
    try:
        df.to_excel('instagram.xlsx')
    except:
        pass
    if json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['has_next_page'] == True:
        end_cursor = json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
        url = base_url + '&max_id=' + end_cursor
    else:
        break
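One observation about the code above, as a hedged sketch rather than a confirmed fix: posts without a caption come back with an empty edge_media_to_caption edges list, so the IndexError branch stores the exception object itself in post_text; a fallback string keeps the 'Text' column clean:
# Defensive caption lookup, assuming the same JSON structure as above.
edges = i['node'].get('edge_media_to_caption', {}).get('edges', [])
post_text = edges[0]['node']['text'] if edges else ''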