I wrote a script to save LinkedIn information such as first name, last name, university attended and, most importantly, the link to the LinkedIn profile. The script uses Selenium and chromedriver to log in to LinkedIn and then scrape the search results. My problem is with saving the profile links: the links aren't scraped properly. Here's my code:
import csv
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import parameters
import re
class LinkedIn():
def __init__(self):
self.driver = webdriver.Chrome()
self.people_ls_dic = []
self.csv_name_colums = ["name","degree_connection","zawod","region","opis","firma","link"]
def login(self):
self.driver.get("http://www.linkedin.com/login")
sleep(3)
username = self.driver.find_element_by_name('session_key')
username.send_keys(parameters.linkedin_username)
password = self.driver.find_element_by_name('session_password')
password.send_keys(parameters.linkedin_password)
sign_in_button = self.driver.find_elements_by_xpath('//*[@class="btn__primary--large from__button--floating mercado-button--primary"]')
sign_in_button[0].click()
sleep(5)
def neville_try(self):
sleep(3)
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
profiles = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/ul')
profiles = profiles.find_elements_by_css_selector('li')
profiles = [(i.text, i.find_element_by_xpath('//*[@data-control-name="entity_result"]').get_attribute('href')) for i in profiles]
print("\n\n")
info_ls = []
for profile, link in profiles:
info_ls.append( (profile.split('\n'), link) )
for iteam, link in info_ls:
if 'Learn more' in iteam:
info_ls.remove(iteam)
print(info_ls)
info_ls = [(iteam, link) for iteam, link in info_ls if iteam != ['']]
for info, link in info_ls:
if info[0] == info[1]:
info.remove(info[1])
try:
name = info[0]
degree_connection = info[2]
zawod = info[3]
region = info[4]
opis = info[5]
opis_f = opis.replace(","," ")
list_of_user_data = [name, zawod, opis_f]
for data in list_of_user_data:
try:
comp = re.findall('at ([a-zA-Z0-9]+)',data)
firma = comp[0]
break
except:
continue
if comp == []:
firma = "brak_danych"
self.people_ls_dic.append({"name":name,"degree_connection":degree_connection,"zawod":zawod,"region":region,"opis":opis,"firma":firma,"link":link})
except:
pass
def go_home(self):
home = self.driver.find_element_by_xpath('//*[@id="inbug-nav-item"]/a')
home.click()
def next_page(self):
sleep(3)
next_p = self.driver.find_element_by_xpath('//*[@aria-label="Next"]')
next_p.click()
def open_people(self):
self.driver.get("https://www.linkedin.com/search/results/people/?origin=DISCOVER_FROM_SEARCH_HOME")
sleep(2)
search_bar = self.driver.find_element_by_xpath('//*[@class="search-global-typeahead__input always-show-placeholder"]')
search_bar.send_keys(parameters.search_query)
search_bar.send_keys(Keys.ENTER)
sleep(3)
def filter_company(self):
cl = self.driver.find_element_by_xpath('//*[@aria-label="Current company filter. Clicking this button displays all Current company filter options."]')
cl.click()
for comp in parameters.list_of_comp:
text = self.driver.find_element_by_xpath('//*[@placeholder="Add a company"]')
text.send_keys(comp)
sleep(1)
filt = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[1]/div/div/div[2]/div/div[2]')
sleep(0.2)
filt.click()
sleep(1)
apply = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[2]/button[2]')
apply.click()
sleep(1)
def close(self):
self.driver.close()
def write_to_csv(self):
csv_file = "neville.csv"
with open(csv_file, 'w', encoding="utf-8", newline='') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames = self.csv_name_colums)
writer.writeheader()
for data in self.people_ls_dic:
writer.writerow(data)
scrypt = LinkedIn()
scrypt.login()
scrypt.open_people()
ls = range(parameters.ilosc_stron)
scrypt.filter_company()
for i in sorted(ls,reverse=True):
scrypt.neville_try()
if i == 1:
break
scrypt.next_page()
scrypt.write_to_csv()
scrypt.close()
Of course I have a parameters file, and it looks like this:
linkedin_username = ""
linkedin_password = ""
search_query = 'vcloud director'
list_of_comp = ['Microsoft']
ilosc_stron = 2  # number of pages to click through
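The links most likely all come out the same because the XPath used on each result item starts with //, so i.find_element_by_xpath('//*[@data-control-name="entity_result"]') searches the whole document and returns the first matching link on the page for every profile. Below is a minimal sketch of the relative-XPath variant, assuming each result <li> still contains an <a> with data-control-name="entity_result" (LinkedIn changes these attributes frequently, so treat the selector as an assumption):

from selenium.common.exceptions import NoSuchElementException

profiles_ul = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/ul')
for li in profiles_ul.find_elements_by_css_selector('li'):
    try:
        # The leading "." keeps the search inside this <li>;
        # a bare "//" always searches from the document root.
        link = li.find_element_by_xpath('.//a[@data-control-name="entity_result"]').get_attribute('href')
    except NoSuchElementException:
        link = None  # e.g. the "Learn more" promo item has no profile link
    print(li.text.split('\n')[0], link)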
I am trying to take an Instagram account with over 1,000,000 followers and add their usernames to a txt file. I am using Selenium for this, but the login authorization fails every time I log in. Any advice on how to get around this? I assume the site believes this is a hack, but I'm not sure.
from selenium import webdriver as web
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
bot_username = 'null'
bot_password = 'Null'
profiles = ['Enter Here']
amount = 300
# 'usernames' or 'links'
result = 'usernames'
us = ''
class Instagram():
def __init__(self, username, password):
self.username = username
self.password = password
options = Options()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
self.browser = web.Chrome("chromedriver",options=options)
self.browser.set_window_size(400, 900)
def close_browser(self):
self.browser.close()
self.browser.quit()
def login(self):
browser = self.browser
try:
browser.get('https://www.instagram.com')
time.sleep(random.randrange(3, 5))
# Enter username:
username_input = browser.find_element_by_name('username')
username_input.clear()
username_input.send_keys(self.username)
time.sleep(random.randrange(2, 4))
# Enter password:
password_input = browser.find_element_by_name('password')
password_input.clear()
password_input.send_keys(self.password)
time.sleep(random.randrange(1, 2))
password_input.send_keys(Keys.ENTER)
time.sleep(random.randrange(3, 5))
print(f'[{self.username}] Successfully logged on!')
except Exception as ex:
print(f'[{self.username}] Authorization fail')
self.close_browser()
def xpath_exists(self, url):
browser = self.browser
try:
browser.find_element_by_xpath(url)
exist = True
except NoSuchElementException:
exist = False
return exist
def get_followers(self, users, amount):
browser = self.browser
followers_list = []
for user in users:
browser.get('https://instagram.com/' + user)
time.sleep(random.randrange(3, 5))
followers_button = browser.find_element_by_xpath('/html/body/div[1]/section/main/div/ul/li[2]/a/span')
count = followers_button.get_attribute('title')
if ',' in count:
count = int(''.join(count.split(',')))
else:
count = int(count)
if amount > count:
print(f'You set amount = {amount} but there are {count} followers, then amount = {count}')
amount = count
followers_button.click()
loops_count = int(amount / 12)
print(f'Scraping. Total: {amount} usernames. Wait {loops_count} iterations')
time.sleep(random.randrange(8,10))
followers_ul = browser.find_element_by_xpath("/html/body/div[6]/div/div/div[2]")
time.sleep(random.randrange(5,7))
try:
for i in range(1, loops_count + 1):
browser.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", followers_ul)
time.sleep(random.randrange(8, 10))
all_div = followers_ul.find_elements_by_tag_name("li")
for us in all_div:
us = us.find_element_by_tag_name("a").get_attribute("href")
if result == 'usernames':
us1 = us.replace("https://www.instagram.com/", "")
us = us1.replace("/", "")
followers_list.append(us)
time.sleep(1)
f3 = open('userlist.txt', 'w')
for list in followers_list:
f3.write(list + '\n')
print(f'Got: {len(followers_list)} usernames of {amount}. Saved to file.')
time.sleep(random.randrange(3, 5))
except Exception as ex:
print(ex)
self.close_browser()
return followers_list
bot = Instagram(bot_username, bot_password)
bot.login()
followers = bot.get_followers(profiles, amount)
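One thing worth ruling out first: the except in login() only catches Selenium errors, so a rejected password or a "suspicious login attempt" checkpoint still prints "Successfully logged on!". A small hedged check, assuming Instagram still sets a sessionid cookie once a login is actually accepted (it has historically, but this is not a documented guarantee):

def login_succeeded(browser):
    # If the sessionid cookie is missing after submitting the form, the login
    # was rejected (wrong credentials, a checkpoint challenge, or a captcha).
    return any(cookie['name'] == 'sessionid' for cookie in browser.get_cookies())

Calling this right after the final sleep in login() tells you whether a session was really created before you start scraping followers.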
I tried to use Python threading to log in with many accounts at once, but I ran into a problem: I have 10 users, yet the code stops after 2 user logins and the rest are never started. Can someone suggest a fix?
import threading
from selenium import webdriver
import time
import numpy as np
from csv import reader
def Login_browser(i):
id_str = ids_pass_list[i][0]
id_pass = ids_pass_list[i][1]
print(i)
print("Login Id: ", id_str)
print("Login Password: ", id_pass)
Options = webdriver.ChromeOptions()
Options.add_argument('--app-https://google.com')
driver = webdriver.Chrome(options=Options)
driver.get('http://localhost/login2/index.php')
time.sleep(1)
try:
driver.get('http://localhost/login2/index.php')
time.sleep(1)
email_field = driver.find_element_by_name("userid")
email_field.send_keys(id_str)
time.sleep(3)
password_field = driver.find_element_by_name("pass")
password_field.send_keys(id_pass)
time.sleep(3)
submit = driver.find_element_by_xpath("/html/body/center/form/table/tbody/tr[3]/td/button[1]")
submit.click()
time.sleep(30)
driver.quit()
except:
time.sleep(10)
Page_Thread = 2
threads = []
with open('user.csv', 'r') as read_obj:
csv_reader = reader(read_obj)
list_of_rows = list(csv_reader)
total_Len = len(list_of_rows)
ids_pass_list = list_of_rows
for i in range(Page_Thread):
threads += [threading.Thread(target=Login_browser,args={i},)]
for t in threads:
t.start()
for t in threads:
t.join
print ( 'succses')
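Two things stand out in the snippet: the loop creates only Page_Thread = 2 threads, so only the first two rows of user.csv are ever used, and t.join is missing its parentheses, so the script never actually waits for the threads. A minimal sketch that starts one thread per CSV row, reusing the Login_browser function above:

import threading
from csv import reader

with open('user.csv', 'r') as read_obj:
    ids_pass_list = list(reader(read_obj))

threads = []
for i in range(len(ids_pass_list)):                        # one thread per user
    t = threading.Thread(target=Login_browser, args=(i,))  # args must be a tuple
    threads.append(t)
    t.start()

for t in threads:
    t.join()                                                # join() with parentheses actually blocks

If 10 parallel Chrome windows are too heavy, cap the number of live threads (for example by starting them in batches) rather than lowering the total count.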
So I wrote this code to get the list of followers of an Instagram profile using the instaloader library in Python:
login_name = 'beyondhelloworld'
target_profile = 'femindharamshi'
# OR
#import sys
#target_profile = sys.argv[1] # pass in target profile as argument
from instaloader import Instaloader, Profile
loader = Instaloader()
# login
try:
loader.load_session_from_file(login_name)
except FileNotFoundError:
loader.context.log("Session file does not exist yet - Logging in.")
if not loader.context.is_logged_in:
loader.interactive_login(login_name)
loader.save_session_to_file()
profile = Profile.from_username(loader.context, target_profile)
followers = profile.get_followers()
loader.context.log()
loader.context.log('Profile {} has {} followers:'.format(profile.username, profile.followers))
loader.context.log()
for follower in followers:
loader.context.log(follower.username, flush=True)
But I keep getting this error:
Loaded session from /Users/femindharamshi/.config/instaloader/session-beyondhelloworld.
Traceback (most recent call last):
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 597, in _obtain_metadata
self._node = metadata['entry_data']['ProfilePage'][0]['graphql']['user']
KeyError: 'graphql'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "il.py", line 20, in <module>
profile = Profile.from_username(loader.context, target_profile)
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 552, in from_username
profile._obtain_metadata() # to raise ProfileNotExistException now in case username is invalid
File "/Users/femindharamshi/Documents/instaload/env/lib/python3.7/site-packages/instaloader/structures.py", line 606, in _obtain_metadata
', '.join(similar_profiles[0:5]))) from err
instaloader.exceptions.ProfileNotExistsException: Profile femindharamshi does not exist.
The most similar profile is: femindharamshi.
How do I solve this issue?
The output says that profile "femindharamshi" does not exist, but that is my own profile. It also says:
The most similar profile is: femindharamshi.
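When the "most similar profile" is exactly the same name, the profile usually does exist; Instagram served a login or rate-limit page instead of the profile JSON, which is why the 'graphql' key is missing in the traceback. A hedged check before fetching the profile, assuming your instaloader version provides test_login() (recent releases do):

from instaloader import Instaloader, Profile

loader = Instaloader()
loader.load_session_from_file(login_name)

# test_login() returns the username the session is valid for, or None if the
# saved cookies have expired and Instagram no longer accepts them.
if loader.test_login() is None:
    loader.context.log("Session expired - logging in again.")
    loader.interactive_login(login_name)
    loader.save_session_to_file()

profile = Profile.from_username(loader.context, target_profile)

If the error persists even with a fresh session, waiting a while before retrying often helps, since the same response is returned when Instagram rate-limits the account.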
import instaloader
import random
import os
dir_path_driver = os.getcwd()
def username_password():
listusername = []
with open("./username.txt","r") as usernames:
for username in usernames:
listusername.append((username.rstrip("\n")).split(":"))
if len(listusername) == 1:
select = 0
else:
select = random.randint(0, len(listusername) - 1)  # randint is inclusive on both ends
return listusername[select][0],listusername[select][1]
def get_followers():
L = instaloader.Instaloader()
# Login or load session
username,password =username_password()
listfile = os.listdir(dir_path_driver+"/cookie")
for i in listfile:
if i != f"{username}":
L.login(username, password)
L.save_session_to_file(filename=dir_path_driver+"/cookie/"+f"{username}")
else:
L.load_session_from_file(filename=dir_path_driver+"/cookie/"+f"{username}",username = username)
file = open("prada_followers.txt","a+")
profile = instaloader.Profile.from_username(L.context, "idinstagram")
for followee in profile.get_followers():
username = followee.username
file.write(username + "\n")
file.close()
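The cookie loop above calls L.login() once for every file in the cookie directory whose name differs from the account, which can log in several times per run. A sketch of the same session handling written as a plain existence check (same directory layout assumed):

import os
import instaloader

L = instaloader.Instaloader()
username, password = username_password()
session_file = os.path.join(dir_path_driver, "cookie", username)

# Reuse the saved session if we have one for this account; otherwise
# log in once and save it for next time.
if os.path.exists(session_file):
    L.load_session_from_file(username, filename=session_file)
else:
    L.login(username, password)
    L.save_session_to_file(filename=session_file)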
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
class InstaBot:
"""InstaBot can login, can return unfollowers that don't
follow you back.
Object requires two args.
'Username' & 'Password' """
def __init__(self,username,pw):
self.username = username
self.pw = pw
self.driver = webdriver.Chrome(executable_path='chromedriver.exe')
self.base_url = "https://instagram.com"
self.driver.get("{}".format(self.base_url))
sleep(2)
self.driver.maximize_window()
self.login()
def login(self):
self.driver.find_element_by_xpath("//input[#name=\"username\"]")\
.send_keys(self.username)
self.driver.find_element_by_xpath("//input[#name=\"password\"]")\
.send_keys(self.pw)
self.driver.find_element_by_xpath("//button[#type=\"submit\"]")\
.click()
sleep(10)
self.driver.find_element_by_xpath("//button[contains(text(), 'Not Now')]")\
.click()
sleep(2)
def get_unfollowers(self):
self.driver.find_element_by_xpath("//a[contains(#href, '/{}')]".format(self.username))\
.click()
sleep(3)
self.driver.find_element_by_xpath("//a[contains(#href, '/following')]")\
.click()
sleep(2)
following = self._get_names()
self.driver.find_element_by_xpath("//a[contains(#href, '/followers')]")\
.click()
sleep(2)
followers = self._get_names()
not_following_back = [user for user in following if user not in followers]
return not_following_back
## suggetions = self.driver.find_element_by_xpath('//h4[contains(text(), Suggetions)]')
## self.driver.execute_script('arguments[0].scrollIntoView()',suggetions)
def _get_names(self):
scroll_box = self.driver.find_element_by_xpath("/html/body/div[4]/div/div[2]")
last_ht , ht = 0,1
while last_ht != ht:
last_ht = ht
sleep(1)
ht = self.driver.execute_script("""
arguments[0].scrollTo(0,arguments[0].scrollHeight);
return arguments[0].scrollHeight;
""", scroll_box)
links = scroll_box.find_elements_by_tag_name('a')
names = [name.text for name in links if name.text != '']
sleep(2)
self.driver.find_element_by_xpath("/html/body/div[4]/div/div[1]/div/div[2]/button")\
.click()
return names
def navigate_to_user(self,user):
self.driver.get("{}/{}".format(self.base_url,user))
def scroll_down(self):
last_height = self.driver.execute_script("return document.body.scrollHeight")
while True:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(2)
new_height = self.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
my_bot = InstaBot("your_username", "your_password")  # fill in your own credentials
##unfollowers = my_bot.get_unfollowers() #will return a list
my_bot.navigate_to_user("a_username_you_follow")  # opens that user's profile page
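The commented-out call above is how the unfollowers list is meant to be collected; a short usage sketch with placeholder credentials:

bot = InstaBot("your_username", "your_password")
not_following_back = bot.get_unfollowers()
print("{} accounts you follow don't follow you back:".format(len(not_following_back)))
for name in not_following_back:
    print(name)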
I am new to Python and I need help with web scraping code to save a dynamic map every week.
This is the site I am interested in: https://gis.cdc.gov/grasp/fluview/main.html
The goal is to get to the page, select the season, select the week, and download the image to a local folder. I'll integrate the image into an automated weekly report using SAS.
Thank you in advance!
import sys
import os
import time
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium import webdriver
import arrow
BASE_URL = 'https://gis.cdc.gov/grasp/fluview/main.html'
DOWNLOAD_PATH = "/Users/"
def closeWebDriver(driver):
if os.name == 'nt':
driver.quit()
else:
driver.close()
def getImage():
profile = FirefoxProfile()
profile.set_preference("browser.download.panel.shown", False)
profile.set_preference("browser.helperApps.neverAsk.openFile","image/png")
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "image/png")
profile.set_preference("browser.download.folderList", 2);
profile.set_preference("browser.download.dir", DOWNLOAD_PATH)
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(BASE_URL)
time.sleep(5)
if not isValidTimeFrame(driver):
print('Not the time to download yet!')
closeWebDriver(driver)
return
selectFirstWeek(driver)
print('- Consume the web.')
wrapper = driver.find_element_by_class_name('downloads-help-area')
download_img_els = wrapper.find_elements_by_class_name('downloads-button')
for el in download_img_els:
text = el.text.encode('utf-8')
# print(text)
if 'download image' == text.strip().lower():
# Process
downloadImage(el)
break
time.sleep(5)
closeWebDriver(driver)
def isValidTimeFrame(driver):
seasons_button = driver.find_element_by_class_name('seasons-button')
time_frame = seasons_button.text.encode('utf-8').strip().lower()
current_year = arrow.now().to('local')
current_year_str = current_year.format('YYYY')
next_year = current_year.shift(years=1)
next_year_str = next_year.format('YY')
print(time_frame)
compare_year = '%s-%s' % (current_year_str, next_year_str)
return time_frame == compare_year
def selectFirstWeek(driver):
prev = driver.find_element_by_id('prevMap')
week = driver.find_element_by_id('weekSlider')
while True:
print(week)
current_number = week.get_property('value')
print('- Week: ' + current_number)
prev.click()
if int(current_number) < 2:
break;
time.sleep(1)
def downloadImage(el):
print('- Click on ' + el.text)
el.click()
getImage()
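One fragile spot: downloadImage() just clicks the button and getImage() then sleeps a fixed 5 seconds before quitting the driver, so a slow download can be cut off. A hedged helper, assuming Firefox saves the PNG into DOWNLOAD_PATH and uses its usual .part temporary files while a download is in progress:

import os
import time

def waitForDownload(path, timeout=60):
    # Poll the download folder until a finished .png appears and no partial
    # .part file remains, or give up after `timeout` seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        files = os.listdir(path)
        if any(f.endswith('.png') for f in files) and not any(f.endswith('.part') for f in files):
            return True
        time.sleep(1)
    return False

Calling waitForDownload(DOWNLOAD_PATH) in place of the final time.sleep(5) in getImage() avoids closing the browser while the image is still being written.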
I use this script to get a list of all file updates to a certain directory. I then parse that list to get a list of time slots I have been active in that directory. That way I can quickly see how much time I have spent on the project and know what to charge my client.
I have written a small python script, adapted from this: https://github.com/jncraton/PythonDropboxUploader
I added the bottom function to retrieve a specific events page from https://www.dropbox.com/events?ns=false&n=50
I used the script until 2 months ago and it worked well, but now I am getting 403: Forbidden errors on:
eventSrc = self.browser.open(req).read()
Probably Dropbox is trying to block scrapers like mine to push programmers to use their API instead, but unfortunately the API doesn't support listing the events.
Can anybody help me out to get it working again?
This is the python code to create the connection:
import mechanize
import urllib
import urllib2  # used for the Request in get_dir_list (this is Python 2 code)
import re
import json
class DropboxConnection:
""" Creates a connection to Dropbox """
email = ""
password = ""
root_ns = ""
token = ""
browser = None
def __init__(self, email, password):
self.email = email
self.password = password
self.login()
self.get_constants()
def login(self):
""" Login to Dropbox and return mechanize browser instance """
# Fire up a browser using mechanize
self.browser = mechanize.Browser()
self.browser.set_handle_equiv(False)
self.browser.set_handle_redirect(True)
self.browser.set_handle_referer(True)
self.browser.set_handle_robots(False)
self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]
# Browse to the login page
self.browser.open('https://www.dropbox.com/login')
# Enter the username and password into the login form
isLoginForm = lambda l: l.action == "https://www.dropbox.com/login" and l.method == "POST"
try:
self.browser.select_form(predicate=isLoginForm)
except:
self.browser = None
raise(Exception('Unable to find login form'))
self.browser['login_email'] = self.email
self.browser['login_password'] = self.password
self.browser['t'] = "1230"
# Send the form
response = self.browser.submit()
def get_constants(self):
""" Load constants from page """
home_src = self.browser.open('https://www.dropbox.com/home').read()
try:
self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
except:
raise(Exception("Unable to find constants for AJAX requests"))
def upload_file(self, local_file, remote_dir, remote_file):
""" Upload a local file to Dropbox """
if(not self.is_logged_in()):
raise(Exception("Can't upload when not logged in"))
self.browser.open('https://www.dropbox.com/')
# Add our file upload to the upload form
isUploadForm = lambda u: u.action == "https://dl-web.dropbox.com/upload" and u.method == "POST"
try:
self.browser.select_form(predicate=isUploadForm)
except:
raise(Exception('Unable to find upload form'))
self.browser.form.find_control("dest").readonly = False
self.browser.form.set_value(remote_dir, "dest")
self.browser.form.add_file(open(local_file, "rb"), "", remote_file)
# Submit the form with the file
self.browser.submit()
def get_dir_list(self, remote_dir):
""" Get file info for a directory """
if(not self.is_logged_in()):
raise(Exception("Can't download when not logged in"))
req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token
req = urllib2.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)
dir_info = json.loads(self.browser.open(req).read())
dir_list = {}
for item in dir_info['file_info']:
# Eliminate directories
if(item[0] == False):
# get local filename
absolute_filename = item[3]
local_filename = re.findall(r".*\/(.*)", absolute_filename)[0]
# get file URL and add it to the dictionary
file_url = item[8]
dir_list[local_filename] = file_url
return dir_list
def get_download_url(self, remote_dir, remote_file):
""" Get the URL to download a file """
return self.get_dir_list(remote_dir)[remote_file]
def download_file(self, remote_dir, remote_file, local_file):
""" Download a file and save it locally """
fh = open(local_file, "wb")
fh.write(self.browser.open(self.get_download_url(remote_dir, remote_file)).read())
fh.close()
def is_logged_in(self):
""" Checks if a login has been established """
if(self.browser):
return True
else:
return False
def getEventsPage(self, n):
if(not self.is_logged_in()):
raise(Exception("Can't get event page when not logged in"))
url = 'https://www.dropbox.com/next_events'
values = {'cur_page': n, 'ns_id': 'false'}
data = urllib.urlencode(values)
req = mechanize.Request(url, data)
# print url + '?' + data
eventSrc = self.browser.open(req).read()
return eventSrc
And this is the loop that parses the events pages:
from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc
c = pdc.Constants()
p = pdt.Calendar(c)
email = "myemail#gmail.com" # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")
dateFile = open('all_file_updates.txt', "wb")
try:
# Create the connection
conn = DropboxConnection(email, password)
except:
print("Connection failed")
else:
print("Connection succesful")
n = 250
found = 0
while(n >= 0):
eventsPageSrc = conn.getEventsPage(n)
soup = BeautifulSoup(eventsPageSrc)
table = soup.find("table", {"id": "events"})
for row in table.findAll('tr'):
link = row.find("a", href=re.compile('^https://dl-web.dropbox.com/get/ProjectName'))
if(link != None):
dateString = row.find("td", attrs={'class': 'modified'}).string
date = p.parse(dateString)
dateFile.write('Date: ' + str(date) + ' file: ' + link.string + '\n')
found = found + 1
n = n - 1
print 'page: ' + str(n) + ' Total found: ' + str(found)
In def get_constants(self): change
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
to
self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]
Dropbox has changed the way it stores constants.
Hope it helps.
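If you want get_constants to survive the next markup change, a small hedged variant that accepts either quoting style (still assuming the page embeds root_ns and TOKEN constants at all):

def get_constants(self):
    """ Load constants from page, accepting either quote style around TOKEN """
    home_src = self.browser.open('https://www.dropbox.com/home').read()
    token_match = re.findall(r"TOKEN: '(.+?)'", home_src) or re.findall(r'TOKEN: "(.+?)"', home_src)
    ns_match = re.findall(r"root_ns: (\d+)", home_src)
    if not token_match or not ns_match:
        raise Exception("Unable to find constants for AJAX requests")
    self.root_ns = ns_match[0]
    self.token = token_match[0]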