Problem with XPath: getting invalid URL when trying to scrape - Python

I have the following problem with my code. I'm trying to scrape information from the following link:
sepomex
I'm trying to get the Estado, Municipio and a list of CPs (códigos postales), but I'm getting this:
[screenshot of the invalid URL error]
and this is my code:
import os
import datetime
import requests
import lxml.html as html

HOME_URL = 'https://www.correosdemexico.gob.mx/SSLServicios/ConsultaCP/Descarga.aspx'

XPATH_ESTADOS = '//select[@id="DdlEstado"]/option[@value > 0]/text()'
XPATH_MUNICIPIOS = '//select[@id="DdlMunicipio"]/option[@value > 0]/text()'
XPATH_LOCALIDAD = '//table[@class="small"]/tbody/tr[@class="dgNormal"]/td/text()'

def parse_edos(estado, today):
    try:
        response = requests.get(estado)
        if response.status_code == 200:
            root = html.fromstring(response.content)
            try:
                municipios = root.xpath(XPATH_MUNICIPIOS)
                for municipio in municipios:
                    localidad = root.xpath(XPATH_LOCALIDAD)
            except IndexError:
                return
            with open(f'{today}/{estado}.txt', 'w', encoding='utf-8') as f:
                for i in localidad:
                    f.write(i + '\n')
        else:
            raise ValueError(f'Error: {response.status_code}')
    except ValueError as err:
        print(err)

def main():
    try:
        response = requests.get(HOME_URL)
        if response.status_code == 200:
            root = html.fromstring(response.content)
            estados = root.xpath(XPATH_ESTADOS)
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            if not os.path.isdir(today):
                os.mkdir(today)
            for estado in estados:
                parse_edos(estado, today)
        else:
            raise ValueError(f'Error: {response.status_code}')
    except ValueError as err:
        print(err)

def run():
    main()

if __name__ == '__main__':
    run()
Sorry about my English :P
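For context on where the invalid URL most likely comes from: XPATH_ESTADOS returns the visible text of each option (state names such as "AGUASCALIENTES"), and parse_edos then passes that text straight to requests.get(). A minimal sketch of the failure, assuming the page behaves like a standard ASP.NET WebForm (the estado value used below is hypothetical, just the kind of string the XPath yields):

# Minimal sketch of the failure, not a full fix: `estados` holds option text,
# so requests.get() is handed a state name instead of a URL.
import requests

estado = 'AGUASCALIENTES'   # hypothetical value; this is what XPATH_ESTADOS returns
try:
    requests.get(estado)
except requests.exceptions.MissingSchema as err:
    print(err)              # "Invalid URL 'AGUASCALIENTES': No scheme supplied ..."

# Descarga.aspx is an ASP.NET WebForm: the Municipio dropdown and the postal-code
# table are produced by postbacks, so a fix would likely need to POST the form
# fields (__VIEWSTATE, __EVENTVALIDATION, DdlEstado, ...) back to HOME_URL rather
# than treating the estado text as a URL to GET.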

Related

requests_html stop website from redirecting

I am trying to scrape the following link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using python/requests_html.
My problem is that it gets auto-redirected to the default server tab instead of the mp4upload tab; I'm trying to find a fix for this but can't figure it out.
Below is the code:
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession

base_url = 'https://9anime.to'

class nine_scraper:
    def get_ep_links(url):
        html = nine_scraper.get_html(url, True)
        servers = html.find('div', id='servers-container')
        if servers:
            results = []
            mp4upload_results = []
            mp4upload = servers.find('div', attrs={'data-id': '35'})
            mp4upload_eps = mp4upload.find_all('a', href=True)
            for ep in mp4upload_eps:
                x = (ep.get('href'), ep.text)
                mp4upload_results.append(x)
            for result in mp4upload_results:
                results.append(base_url + result[0])
            return results
        else:
            print('No servers found!!')

    def get_series_info(url):
        return

    def get_servers(html):
        return

    def find_download(url):
        html = nine_scraper.get_html(url, True)

    def search(query):
        if '&page=' in query:
            query = query.split('&page=')
            search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
        else:
            search_url = base_url + '/search?keyword=' + parse.quote(query)
        html = nine_scraper.get_html(search_url, False)
        film_list = html.find('div', class_='film-list')
        if film_list:
            results = []
            prev_page = html.find('a', class_='pull-left')
            next_page = html.find('a', class_='pull-right')
            films = film_list.find_all('div', class_='inner')
            for film in films:
                results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
            if prev_page.get('href'):
                param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Previous page', url))
            if next_page.get('href'):
                param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Next page', url))
            return results
        else:
            print('No results found!')

    def get_html(url, render_js=False):  # Load the webpage and return its parsed HTML
        try:
            if render_js:  # The page needs to render JavaScript, so use 'requests_html'
                session = HTMLSession()  # Make a GET request to the webpage
                resp = session.get(url, timeout=10)
                resp.raise_for_status()  # Raise an exception if the response doesn't come back 200-400
                resp.html.render(timeout=10)  # Render the JavaScript
                html = BeautifulSoup(resp.html.html, 'html.parser')  # Parse the HTML with 'BeautifulSoup4'
                return html  # Return the parsed HTML
            else:  # Use 'cloudscraper' since we don't need to load any JavaScript
                c_scraper = cloudscraper.create_scraper()  # Make a GET request to the webpage
                resp = c_scraper.get(url)
                resp.raise_for_status()  # Raise an exception if the response doesn't come back 200-400
                html = BeautifulSoup(resp.content, 'html.parser')  # Parse the HTML with 'BeautifulSoup4'
                return html  # Return the parsed HTML
        except requests.HTTPError as e:
            print(f'HTTP error occurred: {e}')
        except requests.ConnectionError as e:
            print(f'Connection error occurred: {e}')
        except requests.Timeout as e:
            print(f'Timeout error occurred: {e}')
        except requests.RequestException as e:
            print(f'General error occurred: {e}')
        except Exception as e:
            print(f'Other error occurred: {e}')
        except KeyboardInterrupt:
            print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper

def screen_clear():
    # os.name is 'nt' on Windows; mac and Linux report 'posix'
    if name == 'nt':
        _ = system('cls')
    else:
        _ = system('clear')

def main_menu():
    while True:
        screen_clear()
        print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
        main_choice = input('Enter your choice [1-3] >')
        if main_choice == '1':
            search_menu()
            break
        elif main_choice == '2':
            continue
        elif main_choice == '3':
            screen_clear()
            sys.exit()
        else:
            continue

def search_menu(query=False):
    screen_clear()
    print('--------------9anime downloader/search--------------\n')
    if query:
        search_results = nine_scraper.search(query)
        results_menu(search_results)
    else:
        query = input('Please enter the name of the anime >')
        if query:
            search_results = nine_scraper.search(query)
            results_menu(search_results)

def results_menu(results):
    for num, result in enumerate(results, 1):
        title = result[0]
        link = result[1]
        if 'Previous page' not in title:
            if 'Next page' in title:
                n = True
                print('[N] ' + title)
            else:
                print(f'[{num}] {title}')
        else:
            p = True
            print('[P] ' + title)
    print('[M] Main menu')
    titles, links = map(list, zip(*results))
    while True:
        search_choice = input('Enter choice >')
        try:
            search_choice = int(search_choice)
            if 1 <= search_choice <= len(results) + 1:
                print(links[search_choice - 1])
                print(titles[search_choice - 1])
                ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
                for link in ep_links:
                    print(link)
                    nine_scraper.find_download(link)
                # series_menu(links[search_choice - 1])
                break
        except ValueError:
            if search_choice.lower() == 'm':
                main_menu()
                break
            elif search_choice.lower() == 'p':
                if p:
                    url = links[-2]
                    search_menu(url)
                    break
                continue
            elif search_choice.lower() == 'n':
                if n:
                    url = links.pop()
                    search_menu(url)
                    break
                continue

def series_menu(url):
    info = nine_scraper.get_series_info()

main_menu()
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do in order to stop that. Any help would be very much appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url, allow_redirects=False)
Now your request should go only to the requested URL.
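As a quick illustration (a sketch using the question's URL, not a guaranteed fix for this particular site), you can check whether the server actually issued a redirect and where it pointed:

from requests_html import HTMLSession

session = HTMLSession()
resp = session.get('https://9anime.to/watch/one-piece-dub.34r/r2wjlq',
                   allow_redirects=False, timeout=10)

print(resp.status_code)               # 3xx if the server redirected
print(resp.headers.get('Location'))   # where it tried to send you, if anywhere

# Note: allow_redirects=False only stops HTTP-level redirects. If the tab switch
# happens in JavaScript after resp.html.render(), it has to be handled in the
# rendered DOM instead (e.g. by selecting the mp4upload server's elements directly).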

multithreading for loop not working in Python with no errors

I have put together the below and wanted to test multithreading.
I am trying to make the for loop run threaded, so several URLs in the list can be processed in parallel.
This script doesn't error, but it doesn't do anything and I am not sure why.
If I remove the multithreading pieces, it works fine.
Can anyone help me?
import multiprocessing.dummy as mp
import requests
import pandas as pd
import datetime

urls = [
    'http://google.co.uk',
    'http://bbc.co.uk/'
]

def do_print(s):
    check_data = pd.DataFrame([])
    now = datetime.datetime.now()
    try:
        response = requests.get(url)
    except:
        response = 'null'
    try:
        response_code = response.status_code
    except:
        response_code = 'null'
    try:
        response_content = response.content
    except:
        response_content = 'null'
    try:
        response_text = response.text
    except:
        response_text = 'null'
    try:
        response_content_type = response.headers['Content-Type']
    except:
        response_content_type = 'null'
    try:
        response_server = response.headers['Server']
    except:
        response_server = 'null'
    try:
        response_last_modified = response.headers['Last-Modified']
    except:
        response_last_modified = 'null'
    try:
        response_content_encoding = response.headers['Content-Encoding']
    except:
        response_content_encoding = 'null'
    try:
        response_content_length = response.headers['Content-Length']
    except:
        response_content_length = 'null'
    try:
        response_url = response.url
    except:
        response_url = 'null'
    if int(response_code) < 400:
        availability = 'OK'
    elif int(response_code) > 399 and int(response_code) < 500:
        availability = 'Client Error'
    elif int(response_code) > 499:
        availability = 'Server Error'
    if int(response_code) < 400:
        availability_score = 1
    elif int(response_code) > 399 and int(response_code) < 500:
        availability_score = 0
    elif int(response_code) > 499:
        availability_score = 0
    d = {'check_time': [now], 'code': [response_code], 'type': [response_content_type], 'url': [response_url], 'server': [response_server], 'modified': [response_last_modified], 'encoding': [response_content_encoding], 'availability': [availability], 'availability_score': [availability_score]}
    df = pd.DataFrame(data=d)
    check_data = check_data.append(df, ignore_index=True, sort=False)

if __name__ == "__main__":
    p = mp.Pool(4)
    p.map(do_print, urls)
    p.close()
    p.join()
When I run the code I get an error because it tries to convert int('null') - all because you have
except:
    response_code = 'null'
If I use except Exception as ex: print(ex) instead, I get an error that the variable url doesn't exist. And that is true, because you have def do_print(s): when it should be def do_print(url):
BTW: instead of 'null' you could use the standard None and later check if response_code: before you try to convert it to an integer, or simply skip the rest of the code when you get an error.
Other problem - the function should return df, and you should collect the results with
results = p.map(...)
and then use results to create the DataFrame check_data.
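Putting those points together, a minimal sketch of the corrected structure (the column set is trimmed for brevity; it is not the full script):

import datetime
import multiprocessing.dummy as mp
import pandas as pd
import requests

urls = [
    'http://google.co.uk',
    'http://bbc.co.uk/'
]

def do_print(url):                      # parameter renamed so requests.get(url) works
    now = datetime.datetime.now()
    try:
        response = requests.get(url, timeout=10)
        response_code = response.status_code
    except requests.RequestException as ex:
        print(ex)
        return None                     # skip the rest on failure instead of storing 'null'
    availability = 'OK' if response_code < 400 else (
        'Client Error' if response_code < 500 else 'Server Error')
    # Return one row instead of appending to a local DataFrame that is thrown away.
    return pd.DataFrame({'check_time': [now], 'code': [response_code],
                         'url': [response.url], 'availability': [availability]})

if __name__ == '__main__':
    p = mp.Pool(4)
    results = p.map(do_print, urls)     # collect the rows returned by each call
    p.close()
    p.join()
    check_data = pd.concat([r for r in results if r is not None], ignore_index=True)
    print(check_data)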

write data column-wise in a MongoDB document

I have made a scraper which scrapes data from a website. My code currently writes to an Excel file; it also reads and updates that file. The code first reads the Excel sheet so it can update the information already there, and if there is new information on the website that is not yet in the Excel sheet, it is added.
Following is the code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
from selenium.webdriver.chrome.options import Options
import logging
#make lists for all the different aspects needed.
links = []
pics = []
types = []
names = []
descs = []
views = []
no_speakers = []
location = []
dates = []
people = []
organization = []
summ = []
twitter = []
facebook = []
contact = []
emails = []
website_link = []
venue = []
official_address = []
speakers = []
fees = []
at_tr = []
prev_links = []
index = -1
update = []
def main_url(url):
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
driver.get(url) #gets the URL
time.sleep(5) # wait 5 seconds until DOM will load completly
while True:
try:
driver.find_element_by_id('view_more').click() #clicks on load more until there are no more events to be loaded.
time.sleep(3)
except Exception as e:
break
rows = driver.find_elements_by_class_name('sec_conf_main')
for row in rows:
conf = row.find_element_by_class_name('conf_summery')
nam = conf.find_element_by_class_name('c_name')
name = nam.find_element_by_tag_name('a')
if len(names) != 0 and name.get_attribute('title') in names:
index = names.index(name.get_attribute('title'))
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
if links[index] == link:
pass
else:
links[index] = link.get_attribute('href') #get link of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
img = link.find_element_by_tag_name('img')
if pics[index] == img.get_attribute('src'):
pass
else:
pics[index] = img.get_attribute('src') #picture source of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
desc = row.find_element_by_class_name('conf_desc')
if descs[index] == desc.text:
pass
else:
descs[index] = desc.text #description of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
if views[index] == view:
pass
else:
views[index] = view #number of views.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
if no_speakers[index] == d[count + 1].text:
pass
else:
no_speakers[index] = d[count + 1].text #number of speakers.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
if types[index] == ','.join(ty):
pass
else:
types[index] = (','.join(ty))#speciality of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
if location[index] == item.text:
pass
else:
location[index] = (item.text) #location of event
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
i = date.find('|')
if dates[index] == date[:i]:
pass
else:
dates[index] = (date[:i]) #date from and to of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
else:
names.append(name.get_attribute('title')) #title of event.
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
links.append(link.get_attribute('href')) #get link of event.
img = link.find_element_by_tag_name('img')
pics.append(img.get_attribute('src')) #picture source of event.
desc = row.find_element_by_class_name('conf_desc')
descs.append(desc.text) #description of event.
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
views.append(view) #number of views.
no_speakers.append(d[count + 1].text) #number of speakers.
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
types.append(','.join(ty))#speciality of event.
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
location.append(item.text) #location of event
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
index = date.find('|')
dates.append(date[:index]) #date from and to of event.
except NoSuchElementException as e:
pass
driver.close()
driver.quit()
def each_event(item):
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
try:
driver.get(item) #get each Link of the event.
time.sleep(5)
if len(prev_links) != 0 and item in prev_links:
index = links.index(item)
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
if organization[index] == ' '.join(l[3:]):
pass
else:
organization[index] = (' '.join(l[3:]))
if not item in update:
update.append(item)
except NoSuchElementException as e:
organization[index] = 'No Organization Given.'
try:
summary = driver.find_element_by_class_name('conf_head_summary')
if summ[index] == summary.find_element_by_tag_name('p').text:
pass
else:
summ[index] = (summary.find_element_by_tag_name('p').text)
if not item in update:
update.append(item)
except NoSuchElementException as e:
summ[index] = 'No Conference Summary Given.'
try:
tw = driver.find_element_by_class_name('TW')
if twitter[index] == tw.get_attribute('title'):
pass
else:
twitter[index] = (tw.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
twitter[index] = 'No Twitter Link'
try:
fb = driver.find_element_by_class_name('FB')
if facebook[index] == fb.get_attribute('title'):
pass
else:
facebook[index] = (fb.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
facebook[index] = ('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if contact[index] == c:
pass
else:
if len(c) == 0:
contact[index] = ('No Contact Number Given.')
else:
contact[index] = (c)
if not item in update:
update.append(item)
except NoSuchElementException as e:
contact[index] = ('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
if emails[index] == ','.join(e):
pass
else:
emails[index] = (','.join(e))
if not item in update:
update.append(item)
except NoSuchElementException as e:
emails[index] = ('No email.')
try:
web = driver.find_element_by_id('cRegistraionpopup5').get_attribute('href')
if website_link[index] == web:
pass
else:
website_link[index] = (web)
if not item in update:
update.append(item)
except NoSuchElementException as e:
website_link[index] = ('No Website Link')
try:
v = driver.find_element_by_class_name('conf_venue1').text
if venue[index] == v:
pass
else:
venue[index] = (v)
if not item in update:
update.append(item)
except NoSuchElementException as e:
venue[index] = ('No Venue Given.')
try:
oa = driver.find_element_by_class_name('hotel-detail').text
if official_address[index] == oa:
pass
else:
official_address[index] = oa
if not item in update:
update.append(item)
except NoSuchElementException as e:
official_address[index] = ('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers[index] = 'No Speakers'
if speakers[index] == ','.join(l):
pass
else:
speakers[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
speakers[index] = ('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
if fees[index] == ','.join(l):
pass
else:
fees[index] = (';'.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
fees[index] = ('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr[index] = 'No Attenders or Trackers Given.'
if at_tr[index] == ','.join(l):
pass
else:
at_tr[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
at_tr[index] = ('No Attenders or Trackers Given')
else:
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
organization.append(' '.join(l[3:]))
except NoSuchElementException as e:
organization.append('No Organization Given.')
try:
summary = driver.find_element_by_class_name('conf_head_summary')
summ.append(summary.find_element_by_tag_name('p').text)
except NoSuchElementException as e:
summ.append('No Conference Summary Given.')
try:
tw = driver.find_element_by_class_name('TW')
twitter.append(tw.get_attribute('title'))
except:
twitter.append('No Twitter Link')
try:
fb = driver.find_element_by_class_name('FB')
facebook.append(fb.get_attribute('title'))
except:
facebook.append('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if len(c) == 0:
contact.append('No Contact Number Given.')
else:
contact.append(c)
except NoSuchElementException as e:
contact.append('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
emails.append(' '.join(e))
except NoSuchElementException as e:
emails.append('No email.')
try:
website_link.append(driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'))
except NoSuchElementException as e:
website_link.append('No Website Link')
try:
venue.append(driver.find_element_by_class_name('conf_venue1').text)
except NoSuchElementException as e:
venue.append('No Venue Given.')
try:
official_address.append(driver.find_element_by_class_name('hotel-detail').text)
except NoSuchElementException as e:
official_address.append('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers.append('No Speakers Given.')
else:
speakers.append(','.join(l))
except NoSuchElementException as e:
speakers.append('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
fees.append(';'.join(l))
except NoSuchElementException as e:
fees.append('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr.append('No Attenders or Trackers Given')
else:
at_tr.append(','.join(l))
except NoSuchElementException as e:
at_tr.append('No Attenders or Trackers Given')
driver.close()
driver.quit()
except Exception as e:
pass
def main():
file = 'EMedEvents.xlsx' #file to write in
book = open_workbook(file)
sheet = book.sheet_by_index(0)
d = pd.read_excel(file)
if d.empty:
pass
else:
for row in range(1, sheet.nrows):
names.append(sheet.cell(row, 0).value)
dates.append(sheet.cell(row, 1).value)
types.append(sheet.cell(row, 2).value)
location.append(sheet.cell(row, 3).value)
descs.append(sheet.cell(row, 4).value)
views.append(sheet.cell(row, 5).value)
no_speakers.append(sheet.cell(row, 6).value)
pics.append(sheet.cell(row, 7).value)
links.append(sheet.cell(row, 8).value)
organization.append(sheet.cell(row, 9).value)
summ.append(sheet.cell(row, 10).value)
twitter.append(sheet.cell(row, 11).value)
facebook.append(sheet.cell(row, 12).value)
contact.append(sheet.cell(row, 13).value)
emails.append(sheet.cell(row, 14).value)
website_link.append(sheet.cell(row, 15).value)
venue.append(sheet.cell(row, 16).value)
official_address.append(sheet.cell(row, 17).value)
speakers.append(sheet.cell(row, 18).value)
fees.append(sheet.cell(row, 19).value)
at_tr.append(sheet.cell(row, 20).value)
if len(links) != 0:
for item in links:
prev_links.append(item)
main_url("https://www.emedevents.com/india-medical-conferences") #main url to use.
for item in links:
each_event(item) #get people information of each event.
df = pd.DataFrame.from_dict({'Event Name':names,'Event Dates':dates, 'Specialty' : types,'Event Location' : location, 'Description' : descs,
'Views' : views, 'Speakers' : no_speakers, 'Picture Source' : pics, 'Event Link' : links, 'Organized By' : organization,
'Conference Summary' : summ, 'Twitter Link' : twitter, 'Facebook Link' : facebook,'Contact Number' : contact,
'Email' : emails, 'Website Link' : website_link, 'Venue' : venue, 'Official Address' : official_address, 'Speaking' : speakers,
'Fees' : fees, 'Attenders and Trackers': at_tr})
df.to_excel(file, header=True, index=False) #print the data in the excel sheet.
logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
logging.info('%d events were read from the excel sheet', len(prev_links))
logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
logging.info('Following are the links of the events that were updated:')
for item in update:
logging.info(item)
if __name__ == '__main__':
main() #if the name is main, run the main method and continue with the program.
I need all of this functionality done in MongoDB rather than in Excel. I am totally new to MongoDB, so I don't know where to start with this.
Any help would be awesome.
Thanks in Advance.
You should probably use a MongoDB driver for Python, such as PyMongo:
https://github.com/mongodb/mongo-python-driver/blob/master/README.rst
If you don't know anything about MongoDB at all, begin with a basic tutorial, then use the driver to write the values to your database:
https://www.tutorialspoint.com/mongodb/mongodb_tutorial.pdf
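As a rough illustration of the idea (a sketch only: the database, collection, and sample values below are made up to mirror the scraper's columns, and it assumes a local mongod is running), storing each event as one document keyed by its link lets upserts replace the read-then-update Excel logic, since they handle both the "new event" and "changed event" cases:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')   # assumes a local MongoDB instance
collection = client['emedevents']['events']          # hypothetical database/collection names

# One scraped event becomes one document (field names mirror the DataFrame columns).
event = {
    'Event Link': 'https://www.emedevents.com/...',  # unique key for the event
    'Event Name': 'Example Conference',
    'Event Dates': '01 Jan 2020 - 03 Jan 2020',
    'Event Location': 'Mumbai, India',
    'Views': '1234',
}

# Upsert: update the document if the link already exists, insert it otherwise.
collection.update_one({'Event Link': event['Event Link']},
                      {'$set': event},
                      upsert=True)

# Reading everything back (the equivalent of loading the Excel sheet):
for doc in collection.find({}, {'_id': 0}):
    print(doc)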

Selenium Web Scraping with Headless Web Driver

I need to scrape a website using selenium. Following is the code for the same:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
import logging
import signal
#make lists for all the different aspects needed.
links = []
pics = []
types = []
names = []
descs = []
views = []
no_speakers = []
location = []
dates = []
people = []
organization = []
summ = []
twitter = []
facebook = []
contact = []
emails = []
website_link = []
venue = []
official_address = []
speakers = []
fees = []
at_tr = []
prev_links = []
index = -1
update = []
def main_url(url):
driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')#gets the web driver.
driver.get(url) #gets the URL
time.sleep(5) # wait 5 seconds until DOM will load completly
while True:
try:
driver.find_element_by_id('view_more').click() #clicks on load more until there are no more events to be loaded.
time.sleep(3)
except Exception as e:
break
rows = driver.find_elements_by_class_name('sec_conf_main')
for row in rows:
conf = row.find_element_by_class_name('conf_summery')
nam = conf.find_element_by_class_name('c_name')
name = nam.find_element_by_tag_name('a')
if len(names) != 0 and name.get_attribute('title') in names:
index = names.index(name.get_attribute('title'))
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
if links[index] == link:
pass
else:
links[index] = link.get_attribute('href') #get link of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
img = link.find_element_by_tag_name('img')
if pics[index] == img.get_attribute('src'):
pass
else:
pics[index] = img.get_attribute('src') #picture source of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
desc = row.find_element_by_class_name('conf_desc')
if descs[index] == desc.text:
pass
else:
descs[index] = desc.text #description of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
if views[index] == view:
pass
else:
views[index] = view #number of views.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
if no_speakers[index] == d[count + 1].text:
pass
else:
no_speakers[index] = d[count + 1].text #number of speakers.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
if types[index] == ','.join(ty):
pass
else:
types[index] = (','.join(ty))#speciality of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
if location[index] == item.text:
pass
else:
location[index] = (item.text) #location of event
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
i = date.find('|')
if dates[index] == date[:i]:
pass
else:
dates[index] = (date[:i]) #date from and to of event.
if not link.get_attribute('href') in update:
update.append(link.get_attribute('href'))
except NoSuchElementException as e:
pass
else:
names.append(name.get_attribute('title')) #title of event.
pic = row.find_element_by_class_name('conf_logo')
link = pic.find_element_by_tag_name('a')
links.append(link.get_attribute('href')) #get link of event.
img = link.find_element_by_tag_name('img')
pics.append(img.get_attribute('src')) #picture source of event.
desc = row.find_element_by_class_name('conf_desc')
descs.append(desc.text) #description of event.
d = conf.find_elements_by_tag_name('strong')
count = 0
while count < len(d):
view = d[count].text
views.append(view) #number of views.
no_speakers.append(d[count + 1].text) #number of speakers.
count = count + 2
t = conf.find_elements_by_class_name('spel')
ty = []
for item in t:
ty.append(item.get_attribute('title'))
types.append(','.join(ty))#speciality of event.
date_place = conf.find_elements_by_class_name('c_summery')
for item in date_place:
try:
if item.find_element_by_tag_name('img'):
location.append(item.text) #location of event
except NoSuchElementException as e:
pass
try:
if item.find_element_by_tag_name('span'):
date = item.text
index = date.find('|')
dates.append(date[:index]) #date from and to of event.
except NoSuchElementException as e:
pass
driver.close()
driver.quit()
def each_event(item):
driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
driver.get(item) #get each Link of the event.
time.sleep(5)
if len(prev_links) != 0 and item in prev_links:
index = links.index(item)
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
if organization[index] == ' '.join(l[3:]):
pass
else:
organization[index] = (' '.join(l[3:]))
if not item in update:
update.append(item)
except NoSuchElementException as e:
organization[index] = 'No Organization Given.'
try:
summary = driver.find_element_by_class_name('conf_head_summary')
if summ[index] == summary.find_element_by_tag_name('p').text:
pass
else:
summ[index] = (summary.find_element_by_tag_name('p').text)
if not item in update:
update.append(item)
except NoSuchElementException as e:
summ[index] = 'No Conference Summary Given.'
try:
tw = driver.find_element_by_class_name('TW')
if twitter[index] == tw.get_attribute('title'):
pass
else:
twitter[index] = (tw.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
twitter[index] = 'No Twitter Link'
try:
fb = driver.find_element_by_class_name('FB')
if facebook[index] == fb.get_attribute('title'):
pass
else:
facebook[index] = (fb.get_attribute('title'))
if not item in update:
update.append(item)
except NoSuchElementException as e:
facebook[index] = ('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if contact[index] == c:
pass
else:
if len(c) == 0:
contact[index] = ('No Contact Number Given.')
else:
contact[index] = (c)
if not item in update:
update.append(item)
except NoSuchElementException as e:
contact[index] = ('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
if emails[index] == ','.join(e):
pass
else:
emails[index] = (','.join(e))
if not item in update:
update.append(item)
except NoSuchElementException as e:
emails[index] = ('No email.')
try:
web = driver.find_element_by_id('cRegistraionpopup5').get_attribute('href')
if website_link[index] == web:
pass
else:
website_link[index] = (web)
if not item in update:
update.append(item)
except NoSuchElementException as e:
website_link[index] = ('No Website Link')
try:
v = driver.find_element_by_class_name('conf_venue1').text
if venue[index] == v:
pass
else:
venue[index] = (v)
if not item in update:
update.append(item)
except NoSuchElementException as e:
venue[index] = ('No Venue Given.')
try:
oa = driver.find_element_by_class_name('hotel-detail').text
if official_address[index] == oa:
pass
else:
official_address[index] = oa
if not item in update:
update.append(item)
except NoSuchElementException as e:
official_address[index] = ('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers[index] = 'No Speakers'
if speakers[index] == ','.join(l):
pass
else:
speakers[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
speakers[index] = ('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
if fees[index] == ','.join(l):
pass
else:
fees[index] = (';'.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
fees[index] = ('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr[index] = 'No Attenders or Trackers Given.'
if at_tr[index] == ','.join(l):
pass
else:
at_tr[index] = (','.join(l))
if not item in update:
update.append(item)
except NoSuchElementException as e:
at_tr[index] = ('No Attenders or Trackers Given')
else:
try:
org = driver.find_element_by_class_name('speakers')
l = org.text.split()
organization.append(' '.join(l[3:]))
except NoSuchElementException as e:
organization.append('No Organization Given.')
try:
summary = driver.find_element_by_class_name('conf_head_summary')
summ.append(summary.find_element_by_tag_name('p').text)
except NoSuchElementException as e:
summ.append('No Conference Summary Given.')
try:
tw = driver.find_element_by_class_name('TW')
twitter.append(tw.get_attribute('title'))
except:
twitter.append('No Twitter Link')
try:
fb = driver.find_element_by_class_name('FB')
facebook.append(fb.get_attribute('title'))
except:
facebook.append('No Facebook Link')
try:
c = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
if len(c) == 0:
contact.append('No Contact Number Given.')
else:
contact.append(c)
except NoSuchElementException as e:
contact.append('No Contact Number Given.')
try:
email = driver.find_elements_by_class_name('emailFruser')
e = []
for item in email:
e.append(item.text)
emails.append(' '.join(e))
except NoSuchElementException as e:
emails.append('No email.')
try:
website_link.append(driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'))
except NoSuchElementException as e:
website_link.append('No Website Link')
try:
venue.append(driver.find_element_by_class_name('conf_venue1').text)
except NoSuchElementException as e:
venue.append('No Venue Given.')
try:
official_address.append(driver.find_element_by_class_name('hotel-detail').text)
except NoSuchElementException as e:
official_address.append('No Official Address Given. ')
try:
sp = driver.find_elements_by_class_name('speaker_single_inn')
l = []
for item in sp:
l.append(driver.find_element_by_xpath('//div/h5/a').text)
if len(l) == 0:
speakers.append('No Speakers Given.')
else:
speakers.append(','.join(l))
except NoSuchElementException as e:
speakers.append('No Speakers')
try:
s = driver.find_element_by_class_name('mobScroll')
trs = s.find_elements_by_xpath('//table/tbody/tr')
l = []
for item in trs:
try:
item.find_element_by_class_name('ticketname_inn')
l.append(item.text)
except NoSuchElementException as e:
pass
fees.append(';'.join(l))
except NoSuchElementException as e:
fees.append('No Fees Given')
try:
sp = driver.find_elements_by_class_name('r-speaker-info')
l = []
for item in sp:
l.append(item.text)
if len(l) == 0:
at_tr.append('No Attenders or Trackers Given')
else:
at_tr.append(','.join(l))
except NoSuchElementException as e:
at_tr.append('No Attenders or Trackers Given')
driver.close()
driver.quit()
def main():
file = 'EMedEvents.xlsx' #file to write in
book = open_workbook(file)
sheet = book.sheet_by_index(0)
d = pd.read_excel(file)
if d.empty:
pass
else:
for row in range(1, sheet.nrows):
names.append(sheet.cell(row, 0).value)
dates.append(sheet.cell(row, 1).value)
types.append(sheet.cell(row, 2).value)
location.append(sheet.cell(row, 3).value)
descs.append(sheet.cell(row, 4).value)
views.append(sheet.cell(row, 5).value)
no_speakers.append(sheet.cell(row, 6).value)
pics.append(sheet.cell(row, 7).value)
links.append(sheet.cell(row, 8).value)
organization.append(sheet.cell(row, 9).value)
summ.append(sheet.cell(row, 10).value)
twitter.append(sheet.cell(row, 11).value)
facebook.append(sheet.cell(row, 12).value)
contact.append(sheet.cell(row, 13).value)
emails.append(sheet.cell(row, 14).value)
website_link.append(sheet.cell(row, 15).value)
venue.append(sheet.cell(row, 16).value)
official_address.append(sheet.cell(row, 17).value)
speakers.append(sheet.cell(row, 18).value)
fees.append(sheet.cell(row, 19).value)
at_tr.append(sheet.cell(row, 20).value)
if len(links) != 0:
for item in links:
prev_links.append(item)
main_url("https://www.emedevents.com/india-medical-conferences") #main url to use.
for item in links:
each_event(item) #get people information of each event.
df = pd.DataFrame.from_dict({'Event Name':names,'Event Dates':dates, 'Specialty' : types,'Event Location' : location, 'Description' : descs,
'Views' : views, 'Speakers' : no_speakers, 'Picture Source' : pics, 'Event Link' : links, 'Organized By' : organization,
'Conference Summary' : summ, 'Twitter Link' : twitter, 'Facebook Link' : facebook,'Contact Number' : contact,
'Email' : emails, 'Website Link' : website_link, 'Venue' : venue, 'Official Address' : official_address, 'Speaking' : speakers,
'Fees' : fees, 'Attenders and Trackers': at_tr})
df.to_excel(file, header=True, index=False) #print the data in the excel sheet.
logging.basicConfig(filename = 'error_' + str(time.time()) + '.log', level = logging.INFO)
logging.info('%d events were read from the excel sheet', len(prev_links))
logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
logging.info('Following are the links of the events that were updated:')
for item in update:
logging.info(item)
if __name__ == '__main__':
main() #if the name is main, run the main method and continue with the program.
The program works on a Windows system with chromedriver.exe downloaded and its path provided in the code. I want to make this work on an Ubuntu Linux based system, which does not use .exe files.
I know I can use headless drivers to make this work.
I tried making the following change:
driver = webdriver.PhantomJS()
The change does not work properly. It gives me the following warning:
UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead
warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '
I do not understand what to do now. What changes do I make in my code so that it works on Ubuntu?
Thanks in Advance.
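Since Selenium has deprecated PhantomJS, the usual replacement is Chrome (or Firefox) in headless mode, which the other version of this scraper already hints at. A minimal sketch for Linux, assuming chromedriver for Linux is installed and on the PATH (the options= keyword is the current Selenium spelling; older Selenium 3 releases used chrome_options=):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')              # no visible browser window
options.add_argument('--no-sandbox')            # often needed in containers / as root
options.add_argument('--disable-dev-shm-usage')

# If chromedriver is on the PATH, no explicit path is needed; otherwise pass it,
# e.g. webdriver.Chrome('/usr/local/bin/chromedriver', options=options)
driver = webdriver.Chrome(options=options)
driver.get('https://www.emedevents.com/india-medical-conferences')
print(driver.title)
driver.quit()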

Scraping Instagram with API ?__a=1

I've been trying to scrape Instagram posts for a certain hashtag, pulling the keys display_url, taken_at_timestamp, text, and edge_liked_by. This works fine for some hundreds of posts at the start, but then it stops fetching only the 'text' key; the other three fields are still fetched successfully. I am not sure why this happens.
I am parsing the JSON from https://www.instagram.com/explore/tags/something/?__a=1.
base_url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"

while True:
    response = url_req.urlopen(url)
    json_file = json.load(response)
    for i in json_file['graphql']['hashtag']['edge_hashtag_to_media']['edges']:
        try:
            post_text = i['node']['edge_media_to_caption']['edges'][0]['node']['text']
        except IndexError as e:
            post_text = e
        try:
            display_url = i['node']['display_url']
        except:
            display_url = e
        try:
            like_count = i['node']['edge_liked_by']['count']
        except:
            like_count = e
        try:
            time_stamp = i['node']['taken_at_timestamp']
        except:
            time_stamp = e
        output.append([display_url, like_count, time_stamp, post_text])
    df = pd.DataFrame(output, columns=['URL', 'Like Count', 'Time', 'Text'])
    try:
        df.to_excel('instagram.xlsx')
    except:
        pass
    if json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['has_next_page'] == True:
        end_cursor = json_file['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
        url = base_url + '&max_id=' + end_cursor
    else:
        break
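One thing worth checking, given the code above: posts with no caption have an empty edge_media_to_caption edges list, and the except IndexError as e branch stores the exception object itself in post_text (the later bare except branches then reuse that same e). A small sketch of extracting the fields defensively instead, so caption-less posts produce an empty string rather than an exception object (extract_post is a hypothetical helper, not part of the original script):

def extract_post(node):
    """Pull the four fields out of one media node, tolerating missing captions."""
    caption_edges = node.get('edge_media_to_caption', {}).get('edges', [])
    post_text = caption_edges[0]['node']['text'] if caption_edges else ''  # '' when no caption
    return [
        node.get('display_url', ''),
        node.get('edge_liked_by', {}).get('count', 0),
        node.get('taken_at_timestamp', 0),
        post_text,
    ]

# Usage inside the existing loop:
# for i in json_file['graphql']['hashtag']['edge_hashtag_to_media']['edges']:
#     output.append(extract_post(i['node']))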
