I got this code off of a forum and I have installed all the dependencies, but I get a few errors.
I don't know Python very well, so I tried to look it up, but I was not able to fix it that way.
#all cats are yellow
from selenium import webdriver
from bs4 import BeautifulSoup
import time
#Quora Login Information
email=""
passy=""
# File With Questions Here
filey = "fileys.txt"
#Read File ,strip new lines ,return question list
def readFile(filey):
    with open(filey, "r") as f:
        q = f.readlines()
    qlist = [x.strip() for x in q]
    # qlist=reversed(qlist) #Will reverse the question list if needed
    print len(qlist), "Total Questions Loaded"
    return qlist
#Login to Quora
def login(email, passy):
print "Logging in..."
driver.get("http://quora.com")
# Create Soup Object and find all form_column classes
forms = BeautifulSoup(driver.page_source, "lxml").find_all(class_="form_column")
# Iterate through forms
# Find polymorphic id string,append a hashtag(#) to create css_selector
for form in forms:
try:
# This is for email/password entry box
data = form.find("input")["name"]
if data == "email":
email_css = "#" + form.find("input")["id"]
if data == "password":
password_css = "#" + form.find("input")["id"]
except:
pass
try:
# This is for the Login Button
data = form.find("input")["value"]
if data == "Login":
button_css = "#" + form.find("input")["id"]
except:
pass
driver.find_element_by_css_selector(email_css).send_keys(email)
driver.find_element_by_css_selector(password_css).send_keys(passy)
time.sleep(2)
driver.find_element_by_css_selector(button_css).click()
time.sleep(2)
# LOGIN FINISHED
#Create Question List
qlist = readFile(filey)
#Create Webdriver Vroom Vroom
driver = webdriver.Chrome()
#Total Questions Posted Counter
county=0
# Iterate through qlist ask questions till no more
for question in qlist:
    try:
        print question
        driver.get("http://quora.com")
        soup = BeautifulSoup(driver.page_source, "lxml")
        # Find all text areas
        blox = soup.find_all("textarea")
        # Find polymorphic id string for Ask Question entry field
        for x in blox:
            try:
                placeholder = x["placeholder"]
                if placeholder.__contains__("Ask or Search Quora"):  # Fix this later
                    askbar_css = "#" + x["id"]
                    print askbar_css
            except:
                pass
        askbutton = "#" + soup.find(class_="AskQuestionButton")["id"]  # Fix this later
        # Type out Question
        driver.find_element_by_css_selector(askbar_css).send_keys(question)
        # Wait for askbutton to become clickable
        time.sleep(.2)  # Fix later
        try:
            driver.find_element_by_css_selector(askbutton).click()
        except:
            # Click Failed # Fix later
            pass
        # Find the popup
        while True:
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="Modal AskQuestionModal")
                break
            except:
                pass
        soup = BeautifulSoup(driver.page_source, "lxml")
        popup = "#" + soup.find(class_="submit_button modal_action")["id"]
        driver.find_element_by_css_selector(popup).click()
        for x in range(0, 17):
            time.sleep(.1)
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="PMsgContainer")  # Found Popup
                if str(popExists).__contains__("You asked"):  # big no no
                    county += 1
                    break
            except:
                pass
        print "county=>", county
    except Exception, e:
        print e
        print "ERROR"
        pass
So the code opens Chrome and loads Quora; however, it gets stuck on logging in and the script ends. I am also on a Mac. I get the following error:
'NoneType' object has no attribute '__getitem__'
ERROR
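The 'NoneType' object has no attribute '__getitem__' message is Python 2's way of saying that None was indexed with [...]; here it almost certainly means one of the BeautifulSoup find(...) calls returned None (most likely the class_="AskQuestionButton" or class_="submit_button modal_action" lookups), because Quora's markup no longer matches those class names. A hedged sketch of the guard you would want around any of those lookups, reusing the soup object from the loop above:
ask_button = soup.find(class_="AskQuestionButton")
if ask_button is None or not ask_button.has_attr("id"):
    # Nothing matched; Quora's page structure has probably changed
    print("Ask button not found, skipping this question")
else:
    askbutton = "#" + ask_button["id"]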
I am trying to retrieve reviews from a particular website using Selenium and Python. I was initially able to iterate through the site's pages because they were paginated with page numbers, but since they changed how they paginate their pages, I have been having problems iterating through them and scraping the data I need.
The code block I use to open the website, retrieve the href link and append it to the URL is:
loc_dict = {'https://www.trustpilot.com/review/www.veinclinics.com'}
proxies = ['172.241.244.85:29842',
'23.106.16.58:29842',
'23.80.148.127:29842',
]
used_user_agents = []
stdev_time = 2
mean_time = 15
with open('user_agents.txt') as f:
    user_agents = []
    for line in f:
        line = line.replace("\n","")
        user_agents.append(line)
print "Using "+str(len(user_agents))+" user agents"
soup_list = []
fail_list =[]
for item in loc_dict:
flag = "bad"
while flag == "bad":
print item
wait_time = abs(random.gauss(mean_time,stdev_time))
#selecting random user agent
used_agent = user_agents[int(random.uniform(1,len(user_agents)))]
print "Using agent: " + used_agent
while used_agent in used_user_agents:
used_agent = user_agents[int(random.uniform(1,len(user_agents)))]
used_user_agents.append(used_agent)
if len(used_user_agents) > 4:
used_user_agents = []
#random proxy
PROXY = proxies[int(random.uniform(1,len(proxies)))]
print PROXY
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument('--proxy-server=%s' % PROXY)
#options.add_argument("user-agent=%s" % used_agent)
driver = webdriver.Chrome(r'C:\Users\svaddi\Trustpilot-ADCS\chromedriver.exe')
wait = WebDriverWait(driver, 10)
driver.get(item)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
time.sleep(wait_time)
driver.close()
soup = BeautifulSoup(html,'lxml')
browser_check = soup.find('div', attrs={'class':'row'})
print browser_check
if browser_check==None:
print "browser worked"
flag= "good"
else:
if "no longer supports this browser" in str(browser_check):
time.sleep(120.0)
continue
else:
print "browser worked but has that row"
flag= "good"
find_all_a = soup.find_all("a", {"rel":"next"}, href=True)
for el in find_all_a:
tags = el['href']
print tags
clickable_link = ['https://www.trustpilot.com' + str(tags)]
real_url = ''.join(clickable_link)
print real_url
real_url gets me the url for the next page that I'd need to scrape. How do I update loc_dict with real_url?
There are several issues here... starting with the fact that loc_dict is actually a set, not a dict. Given how you're using it as more of a list-y/set-y structure, that fact doesn't seem to be an issue.
However, if you attempt to modify the set while you're iterating over it (using the for iterator), you'll get an error like this: RuntimeError: Set changed size during iteration
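A minimal illustration of that failure, nothing site-specific, just the set behaviour:
urls = {'https://www.trustpilot.com/review/www.veinclinics.com'}
for u in urls:
    # adding to the set while the for loop is walking it changes its size...
    urls.add(u + '?page=2')
# ...which raises: RuntimeError: Set changed size during iteration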
So what you might want to do is have two sets: one of URLs to process, and one of URLs that you're done with. Then iterate with a while loop on a condition (the to-process set still has items), so that you're never technically iterating over the set itself. Like so:
loc_set = {'https://www.trustpilot.com/review/www.veinclinics.com'}
processed_set = set()
proxies = ['172.241.244.85:29842',
'23.106.16.58:29842',
'23.80.148.127:29842',
]
used_user_agents = []
stdev_time = 2
mean_time = 15
with open('user_agents.txt') as f:
    user_agents = []
    for line in f:
        line = line.replace("\n","")
        user_agents.append(line)
print "Using "+str(len(user_agents))+" user agents"
soup_list = []
fail_list =[]
while len(loc_set) > 0:
    item = loc_set.pop()
    processed_set.add(item)
    flag = "bad"
    while flag == "bad":
        print item
        wait_time = abs(random.gauss(mean_time, stdev_time))
        #selecting random user agent
        used_agent = user_agents[int(random.uniform(1, len(user_agents)))]
        print "Using agent: " + used_agent
        while used_agent in used_user_agents:
            used_agent = user_agents[int(random.uniform(1, len(user_agents)))]
        used_user_agents.append(used_agent)
        if len(used_user_agents) > 4:
            used_user_agents = []
        #random proxy
        PROXY = proxies[int(random.uniform(1, len(proxies)))]
        print PROXY
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_argument('--proxy-server=%s' % PROXY)
        #options.add_argument("user-agent=%s" % used_agent)
        driver = webdriver.Chrome(r'C:\Users\svaddi\Trustpilot-ADCS\chromedriver.exe')
        wait = WebDriverWait(driver, 10)
        driver.get(item)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        time.sleep(wait_time)
        driver.close()
        soup = BeautifulSoup(html, 'lxml')
        browser_check = soup.find('div', attrs={'class': 'row'})
        print browser_check
        if browser_check == None:
            print "browser worked"
            soup_list.append([placeid, soup])
            flag = "good"
        else:
            if "no longer supports this browser" in str(browser_check):
                time.sleep(120.0)
                continue
            else:
                print "browser worked but has that row"
                soup_list.append([placeid, soup])
                flag = "good"
    find_all_a = soup.find_all("a", {"rel": "next"}, href=True)
    for el in find_all_a:
        tags = el['href']
        print tags
        clickable_link = ['https://www.trustpilot.com' + str(tags)]
        real_url = ''.join(clickable_link)
        print real_url
        if real_url not in processed_set and real_url not in loc_set:
            loc_set.add(real_url)
Now, because we've added each item to processed_set, it will contain each of the URLs that you wanted to process. And since you pop() all of the items out of loc_set (your loc_dict, which I've renamed), the loop terminates when all of the URLs have been processed. I also added a catch in there at the end:
if real_url not in processed_set and real_url not in loc_set: ...
This is to make sure that you don't process any given link more than once.
Also, this code appears to be written in Python 2, which has been dead for over a year now. Move to Python 3. If you're on a Mac, the command is python3, since Apple refuses to alias python to it.
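A minimal sketch of what that port actually touches in this snippet; the main thing that hard-breaks is the print statements (placeholder values below, just to show the form):
# Python 3: print is a function, so the bare print statements become calls
user_agents = ["Mozilla/5.0 (placeholder)"]  # placeholder list, just for this demo
used_agent = user_agents[0]
print("Using " + str(len(user_agents)) + " user agents")
print(f"Using agent: {used_agent}")  # f-strings are the idiomatic Python 3 form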
I am trying to learn how to make a WhatsApp bot, so I took someone's code from the internet and tried to change it to fit my WhatsApp. The problem is that when I run it, it can't find the unread messages and always clicks the second chat in my chat list.
Error line: list index out of range
Here is the code; I hope you will be able to help me with this :)
source code: https://blog.usejournal.com/build-a-basic-news-fetching-whatsapp-bot-in-python-under-60-lines-of-code-2d992faf7f79
from logging import root
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
from urllib3.util import url
browser = webdriver.Firefox(executable_path='realPathIsHere')
browser.get('https://web.whatsapp.com')
def getNews():
    text_box = browser.find_element_by_class_name("_3uMse")
    response = "Let me fetch and send top 5 latest news:\n"
    text_box.send_keys(response)
    soup = BeautifulSoup(requests.get(url).content, "html5lib")
    articles = soup.find_all('article',
                             class_="MQsxIb xTewfe R7GTQ keNKEd j7vNaf Cc0Z5d YKEnGe EyNMab t6ttFe Fm1jeb EjqUne")
    news = [i.find_all('a', class_="ipQwMb Q7tWef")[0].text for i in articles[:5]]
    links = [root + i.find('a')['href'][1:] for i in articles[:5]]
    links = [requests.get("http://thelink.la/api-shorten.php?url=" + link).content.decode() for link in links]
    for i in range(5):
        text_box.send_keys(news[i] + "==>" + links[i] + "\n")
bot_users = {} # A dictionary that stores all the users that sent activate bot
while True:
    unread = browser.find_elements_by_class_name("ZKn2B")
    name, message = '', ''
    if len(unread) > 0:
        ele = unread[-1]
        action = webdriver.common.action_chains.ActionChains(browser)
        action.move_to_element_with_offset(ele, 0, -20)  # move a bit to the left from the green dot
        # Clicking couple of times because sometimes whatsapp web responds after two clicks
        try:
            action.click()
            action.perform()
            action.click()
            action.perform()
        except Exception as e:
            pass
        try:
            name = browser.find_element_by_class_name("Pv-sE").text  # Contact name
            message = browser.find_elements_by_class_name("vW7d1")[-1]
            if 'activate bot' in message.text.lower():
                if name not in bot_users:
                    bot_users[name] = True
                    text_box = browser.find_element_by_class_name("_3uMse")
                    response = "Hi " + name + ". Tal's Bot here :). Now I am activated for you\n"
                    text_box.send_keys(response)
            if name in bot_users:
                if 'show' in message.text.lower() and 'news' in message.text.lower():
                    getNews()
                if 'deactivate' in message.text.lower():
                    if name in bot_users:
                        text_box = browser.find_element_by_class_name("_3uMse")
                        response = "Bye " + name + ".\n"
                        text_box.send_keys(response)
                        del bot_users[name]
        except Exception as e:
            print(e)
            pass
    sleep(2)  # A 2 second pause so that the program doesn't run too fast
I don't know why, but this is working now :)
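For anyone hitting the same list index out of range: it most likely comes from indexing an empty result, e.g. find_elements_by_class_name(...)[-1] when WhatsApp Web has not yet rendered any element with that (obfuscated, frequently changing) class name. A defensive sketch, assuming the same browser object and class name as in the script above:
def latest_unread_message(browser):
    # Return the newest message element, or None if nothing is rendered yet.
    # "vW7d1" is the class the script above uses; it may have changed since.
    messages = browser.find_elements_by_class_name("vW7d1")
    if not messages:  # an empty list would otherwise raise IndexError on [-1]
        return None
    return messages[-1]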
I wrote a Python script for web scraping so that I can import data from Flipkart.
I need to load multiple pages so that I can import many products, but right now only one product page is being scraped.
from urllib.request import urlopen as uReq
from requests import get
from bs4 import BeautifulSoup as soup
import tablib
my_url = 'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1'
uClient2 = uReq(my_url)
page_html = uClient2.read()
uClient2.close()
page_soup = soup(page_html, "html.parser")
containers11 = page_soup.findAll("div",{"class":"_3O0U0u"})
filename = "FoodProcessor.csv"
f = open(filename, "w", encoding='utf-8-sig')
headers = "Product, Price, Description \n"
f.write(headers)
for container in containers11:
    title_container = container.findAll("div",{"class":"_3wU53n"})
    product_name = title_container[0].text
    price_con = container.findAll("div",{"class":"_1vC4OE _2rQ-NK"})
    price = price_con[0].text
    description_container = container.findAll("ul",{"class":"vFw0gD"})
    product_description = description_container[0].text
    print("Product: " + product_name)
    print("Price: " + price)
    print("Description" + product_description)
    f.write(product_name + "," + price.replace(",","") +"," + product_description +"\n")
f.close()
You have to check whether the next-page button exists or not. If it does, return True, go to that next page, and start scraping; if it doesn't, return False and move on to the next container. Check the class name of that button first.
# to check if a pagination exists on the page:
def go_next_page():
    try:
        button = driver.find_element_by_xpath('//a[@class="<class name>"]')
        return True, button
    except NoSuchElementException:
        return False, None
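A hedged sketch of how that helper might be wired into the scraping loop; driver and go_next_page are the ones above, while scrape_current_page and the fixed sleep are assumptions standing in for your existing per-page code:
while True:
    scrape_current_page()  # hypothetical: your existing parsing of the visible page
    has_next, button = go_next_page()
    if not has_next:
        break  # no pagination element left, so we are done
    button.click()
    time.sleep(2)  # crude wait for the next page to render (assumes import time)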
Alternatively, you can first get the number of pages available and then iterate over each page, parsing the data for each one. The URL changes with respect to the page number:
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1' which points to page 1
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=2' which points to page 2
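A minimal sketch of that approach with requests and BeautifulSoup, assuming the last page number (5 here) has been checked manually on the site and that the class names from the question are still current:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page={}'
last_page = 5  # assumption: read the real count from the pagination footer

for page in range(1, last_page + 1):
    page_soup = BeautifulSoup(requests.get(base_url.format(page)).text, "html.parser")
    # "_3O0U0u" and "_3wU53n" are the classes used in the question and may have changed
    for container in page_soup.find_all("div", {"class": "_3O0U0u"}):
        title = container.find("div", {"class": "_3wU53n"})
        if title:
            print(title.text)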
try:
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except ElementClickInterceptedException as ec:
    classes = "_3ighFh"
    overlay = driver.find_element_by_xpath("(//div[@class='{}'])[last()]".format(classes))
    driver.execute_script("arguments[0].style.visibility = 'hidden'", overlay)
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except Exception as e:
    print(str(e.msg()))
    break
except TimeoutException:
    print("Page Timed Out")
    driver.quit()
For me, the easiest way is to add an extra loop with the "page" variable:
# just check the number of the last page on the website
page = 1
while page != 10:
    print(f'Scraping page: {page}')
    my_url = f'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page={page}'
    # here add the for loop you already have
    page += 1
This method should work.
I have created a website scraper which will scrape all info from yellow pages (for educational purposes)
def actual_yellow_pages_scrape(link,no,dir,gui,sel,ypfind,terminal,user,password,port,type):
    print(link, no, dir, gui, sel, ypfind, terminal, user, password, port, type)
    r = requests.get(link, headers=REQUEST_HEADERS)
    soup = BeautifulSoup(r.content, "html.parser")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(str(ypfind))
    count = 0
    for i in soup.find_all(class_="business-name"):
        sheet.write(count, 0, str(i.text))
        sheet.write(count, 1, str("http://www.yellowpages.com" + i.get("href")))
        r1 = requests.get("http://www.yellowpages.com" + i.get("href"))
        soup1 = BeautifulSoup(r1.content, "html.parser")
        website = soup1.find("a", class_="custom-link")
        try:
            print("Acquiring Website")
            sheet.write(count, 2, str(website.get("href")))
        except:
            sheet.write(count, 2, str("None"))
        email = soup1.find("a", class_="email-business")
        try:
            print(email.get("href"))
            EMAIL = re.sub("mailto:", "", str(email.get("href")))
            sheet.write(count, 3, str(EMAIL))
        except:
            sheet.write(count, 3, str("None"))
        phonetemp = soup1.find("div", class_="contact")
        try:
            phone = phonetemp.find("p")
            print(phone.text)
            sheet.write(count, 4, str(phone.text))
        except:
            sheet.write(count, 4, str("None"))
        reviews = soup1.find(class_="count")
        try:
            print(reviews.text)
            sheet.write(count, 5, str(reviews.text))
        except:
            sheet.write(count, 5, str("None"))
        count += 1
    save = dir + "\\" + ypfind + str(no) + ".xls"
    workbook.save(save)
    no += 1
    for i in soup.find_all("a", class_="next ajax-page"):
        print(i.get("href"))
        actual_yellow_pages_scrape("http://www.yellowpages.com" + str(i.get("href")), no, dir, gui, sel, ypfind, terminal, user, password, port, type)
The code above is the relevant portion of my scraper. I set breakpoints at soup and inside the for loop, and not even a single line of the for loop gets executed. No errors are thrown. I tried the same setup just printing numbers from 1 to 10 and it works, but this is not working. Why?
Thank you
The answer has been found.
I used a text visualizer to see what is in "r.content". I soupified it, got clean HTML, went through the HTML file, and finally found that the site was reporting the browser as unsupported. So I removed the requests header, ran the code, and finally got what I wanted.
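A quick way to do that kind of inspection without a separate viewer, sketched under the assumption that the blocked response contains some recognisable "unsupported browser" wording:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.yellowpages.com/")  # headers deliberately left at requests' defaults
soup = BeautifulSoup(r.content, "html.parser")

# Dump the prettified HTML to a file so it can be read in any editor
with open("response_dump.html", "w", encoding="utf-8") as f:
    f.write(soup.prettify())

# Cheap check for a block page; the exact wording is an assumption
if "unsupported" in r.text.lower() or "not supported" in r.text.lower():
    print("The site appears to be rejecting this client/header combination")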
while True:
    for rate in soup.find_all('div',{"class":"rating"}):
        if rate.img is not None:
            print (rate.img['alt'])
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()
while True:
    for rate in soup.findAll('div',{"class":"listing_title"}):
        print (rate.a.text)
    try:
        driver.find_element_by_link_text('Next').click()
    except:
        break
driver.quit()
This should do what you're looking for. You should grab the parent class of both (I chose .listing), get each attribute from there, insert them into a dict, and then write the dicts to CSV with the Python csv library. Just as a fair warning, I didn't run it until it broke; I broke out after the second loop to save some computing.
WARNING HAVE NOT TESTED ON FULL SITE
import csv
import time
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
url = 'http://www.tripadvisor.in/Hotels-g186338-London_England-Hotels.html'
driver = webdriver.Firefox()
driver.get(url)
hotels = []
while True:
    html = driver.page_source
    soup = BeautifulSoup(html)
    listings = soup.select('div.listing')
    for l in listings:
        hotel = {}
        hotel['name'] = l.select('a.property_title')[0].text
        hotel['rating'] = float(l.select('img.sprite-ratings')[0]['alt'].split('of')[0])
        hotels.append(hotel)
    next = driver.find_element_by_link_text('Next')
    if not next:
        break
    else:
        next.click()
        time.sleep(0.5)
if len(hotels) > 0:
    with open('ratings.csv', 'w') as f:
        fieldnames = [ k for k in hotels[0].keys() ]
        writer = csv.DictWriter(f,fieldnames=fieldnames)
        writer.writeheader()
        for h in hotels:
            writer.writerow(h)
driver.quit()
You should look at using a list.
I would try something like this:
for rate in soup.findAll('div',{"class":["rating","listing_title"]}):
(could be wrong, this machine doesn't have bs4 for me to check, sorry)
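For what it's worth, passing a list of class names to find_all/findAll does work in BeautifulSoup (a tag matches if it carries any of them). A small sketch building on the loops above; soup and the class names are the ones from the question:
# Collect both kinds of elements in a single pass, preserving page order
results = []
for tag in soup.find_all('div', {"class": ["rating", "listing_title"]}):
    if "listing_title" in tag.get("class", []):
        results.append(("title", tag.a.text if tag.a else None))
    elif tag.img is not None:
        results.append(("rating", tag.img['alt']))
print(results)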