I am trying to learn how to build a WhatsApp bot, so I took someone's code from the internet and tried to change it to fit my WhatsApp. The problem is that when I run it, it can't find the unread messages and always clicks the second chat in my chat list.
Error: list index out of range
Here is the code; I hope you will be able to help me with this :)
source code: https://blog.usejournal.com/build-a-basic-news-fetching-whatsapp-bot-in-python-under-60-lines-of-code-2d992faf7f79
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time

# These two names were originally (and incorrectly) pulled in via
# "from logging import root" and "from urllib3.util import url"; they are
# meant to be the news site's URLs (Google News, judging by the class names below).
url = "https://news.google.com/"
root = "https://news.google.com/"

browser = webdriver.Firefox(executable_path='realPathIsHere')
browser.get('https://web.whatsapp.com')


def getNews():
    text_box = browser.find_element_by_class_name("_3uMse")  # WhatsApp message input box
    response = "Let me fetch and send top 5 latest news:\n"
    text_box.send_keys(response)
    soup = BeautifulSoup(requests.get(url).content, "html5lib")
    articles = soup.find_all('article',
                             class_="MQsxIb xTewfe R7GTQ keNKEd j7vNaf Cc0Z5d YKEnGe EyNMab t6ttFe Fm1jeb EjqUne")
    news = [i.find_all('a', class_="ipQwMb Q7tWef")[0].text for i in articles[:5]]
    links = [root + i.find('a')['href'][1:] for i in articles[:5]]
    links = [requests.get("http://thelink.la/api-shorten.php?url=" + link).content.decode() for link in links]
    for i in range(5):
        text_box.send_keys(news[i] + "==>" + links[i] + "\n")
bot_users = {}  # A dictionary that stores all the users that sent "activate bot"

while True:
    unread = browser.find_elements_by_class_name("ZKn2B")  # green unread-message badges
    name, message = '', ''
    if len(unread) > 0:
        ele = unread[-1]
        action = webdriver.common.action_chains.ActionChains(browser)
        action.move_to_element_with_offset(ele, 0, -20)  # move a bit up from the green dot
        # Clicking a couple of times because sometimes WhatsApp Web responds only after two clicks
        try:
            action.click()
            action.perform()
            action.click()
            action.perform()
        except Exception as e:
            pass
        try:
            name = browser.find_element_by_class_name("Pv-sE").text  # Contact name
            message = browser.find_elements_by_class_name("vW7d1")[-1]  # Last message in the chat
            if 'activate bot' in message.text.lower():
                if name not in bot_users:
                    bot_users[name] = True
                    text_box = browser.find_element_by_class_name("_3uMse")
                    response = "Hi " + name + ". Tal's Bot here :). Now I am activated for you\n"
                    text_box.send_keys(response)
            if name in bot_users:
                if 'show' in message.text.lower() and 'news' in message.text.lower():
                    getNews()
            if 'deactivate' in message.text.lower():
                if name in bot_users:
                    text_box = browser.find_element_by_class_name("_3uMse")
                    response = "Bye " + name + ".\n"
                    text_box.send_keys(response)
                    del bot_users[name]
        except Exception as e:
            print(e)
    time.sleep(2)  # A 2 second pause so that the program doesn't run too fast
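(For reference, the "list index out of range" error usually means one of the find_elements_by_class_name(...) calls returned an empty list, since WhatsApp Web's obfuscated class names change frequently. A minimal guard, sketched against the loop above, that waits instead of crashing:)

messages = browser.find_elements_by_class_name("vW7d1")
if not messages:  # class name may have changed, or the chat has not loaded yet
    time.sleep(2)
    continue
message = messages[-1]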
I don't know why, but this is working now :)
I am trying to scrape a YouTube channel and return the links for each of its videos; however, when I try to print out these links, I only get a few links that have nothing to do with the videos. I suspect the videos may be loaded by JavaScript, so is there even a way to do this with BeautifulSoup? Will I have to use Selenium? Can somebody please help me and do some testing? Here is my code so far:
import requests
from bs4 import BeautifulSoup

print('scanning page...')

youtuber = 'memeulous'
result = requests.get('https://www.youtube.com/c/' + youtuber + '/videos')
status = result.status_code
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')

if status == 200:
    print('valid URL, grabbing uploads...')
else:
    print('invalid URL, status code: ' + str(status))
    quit()

print(links)
and here is my output:
scanning page...
valid URL, grabbing uploads...
[About, Press, Copyright, Contact us, Creators, Advertise, Developers, Terms, Privacy, Policy and Safety, How YouTube works, Test new features]
[Finished in 4.0s]
As you can see, there are no video links.
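(A quick way to confirm the JavaScript suspicion: the video grid is rendered client-side from a JSON blob called ytInitialData embedded in the page source, so the plain <a> tags never contain the video links. A rough sketch, assuming the page still embeds that blob:)

import re
import requests

src = requests.get('https://www.youtube.com/c/memeulous/videos').text
print('ytInitialData' in src)  # True: the grid is built in the browser from this embedded JSON
print(len(set(re.findall(r'"videoId":"([\w-]{11})"', src))))  # rough count of video IDs present in the JSON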
One way of doing this would be with the following code:
import requests

api_key = "PASTE_YOUR_API_KEY_HERE!"
yt_user = "memeulous"

api_url = f"https://www.googleapis.com/youtube/v3/channels?part=contentDetails&forUsername={yt_user}&key={api_key}"
response = requests.get(api_url).json()
playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
channel_url = f"https://www.googleapis.com/youtube/v3/playlistItems?" \
              f"part=snippet%2CcontentDetails&maxResults=50&playlistId={playlist_id}&key={api_key}"


def get_video_ids(vid_data: dict) -> list:
    return [_id["contentDetails"]["videoId"] for _id in vid_data["items"]]


def build_links(vid_ids: list) -> list:
    return [f"https://www.youtube.com/watch?v={_id}" for _id in vid_ids]


def get_all_links() -> list:
    all_links = []
    url = channel_url
    while True:
        res = requests.get(url).json()
        all_links.extend(build_links(get_video_ids(res)))
        try:
            paging_token = res["nextPageToken"]
            url = f"{channel_url}&pageToken={paging_token}"
        except KeyError:
            break
    return all_links


print(get_all_links())
This gets you all the video links (469) for the memeulous user.
['https://www.youtube.com/watch?v=4L8_isnyGfg', 'https://www.youtube.com/watch?v=ogpaiD2e-ss', 'https://www.youtube.com/watch?v=oH-nJe9XMN0', 'https://www.youtube.com/watch?v=kUcbKl4qe5g', ...
You can get the total video count from the videos_data object like this:
print(f"Total videos: {videos_data['pageInfo']['totalResults']}")
I hope this helps and gets you started. All you need to do is get an API key for the YouTube Data API.
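For example, rather than hard-coding the key into the script, you could read it from an environment variable (a small sketch; the variable name YT_API_KEY is just a placeholder):

import os

api_key = os.environ.get("YT_API_KEY")  # hypothetical environment variable holding your key
if not api_key:
    raise SystemExit("Set the YT_API_KEY environment variable first")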
I wrote a Python script for web scraping so that I can import data from Flipkart.
I need to load multiple pages so that I can import many products, but right now only one product page is coming.
from urllib.request import urlopen as uReq
from requests import get
from bs4 import BeautifulSoup as soup
import tablib

my_url = 'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1'

uClient2 = uReq(my_url)
page_html = uClient2.read()
uClient2.close()

page_soup = soup(page_html, "html.parser")
containers11 = page_soup.findAll("div", {"class": "_3O0U0u"})

filename = "FoodProcessor.csv"
f = open(filename, "w", encoding='utf-8-sig')
headers = "Product, Price, Description \n"
f.write(headers)

for container in containers11:
    title_container = container.findAll("div", {"class": "_3wU53n"})
    product_name = title_container[0].text

    price_con = container.findAll("div", {"class": "_1vC4OE _2rQ-NK"})
    price = price_con[0].text

    description_container = container.findAll("ul", {"class": "vFw0gD"})
    product_description = description_container[0].text

    print("Product: " + product_name)
    print("Price: " + price)
    print("Description: " + product_description)

    f.write(product_name + "," + price.replace(",", "") + "," + product_description + "\n")

f.close()
You have to check whether a next-page button exists or not. If it does, return True together with the button, go to that next page, and continue scraping; if not, return False and move on. Check the class name of that button first.
# to check if a pagination (next page) button exists on the page:
from selenium.common.exceptions import NoSuchElementException

def go_next_page():
    try:
        button = driver.find_element_by_xpath('//a[@class="<class name>"]')
        return True, button
    except NoSuchElementException:
        return False, None
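A minimal sketch of how this helper might be used in a scraping loop (scrape_current_page() is a hypothetical stand-in for the parsing code you already have):

import time

while True:
    scrape_current_page()                 # hypothetical: parse the product containers on the current page
    has_next, next_button = go_next_page()
    if not has_next:
        break                             # no more pages
    next_button.click()
    time.sleep(2)                         # give the next page time to load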
You can first get the number of pages available and then iterate over each page, parsing the data for each one; a short sketch follows below. The URL changes with respect to the page:
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=1' points to page 1
'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page=2' points to page 2
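A minimal sketch of that page-by-page approach, reusing the requests/BeautifulSoup setup from the question (the upper bound of 24 pages is just a placeholder; read the real last-page number from the site first):

from requests import get
from bs4 import BeautifulSoup as soup

base_url = 'https://www.flipkart.com/food-processors/pr?sid=j9e%2Cm38%2Crj3'
for page in range(1, 25):  # placeholder: iterate up to the real last page
    page_html = get(base_url, params={'page': page}).text
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "_3O0U0u"})
    # ... reuse the container-parsing loop from the question here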
# Alternatively, with Selenium, click the "Next" button directly and hide the
# overlay that sometimes intercepts the click:
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException

try:
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except ElementClickInterceptedException as ec:
    # an overlay element is blocking the button; hide it and click again
    classes = "_3ighFh"
    overlay = driver.find_element_by_xpath("(//div[@class='{}'])[last()]".format(classes))
    driver.execute_script("arguments[0].style.visibility = 'hidden'", overlay)
    next_btn = driver.find_element_by_xpath("//a//span[text()='Next']")
    next_btn.click()
except TimeoutException:
    print("Page Timed Out")
    driver.quit()
except Exception as e:
    print(str(e))
    break  # note: break only makes sense if this block sits inside the page loop
For me, the easiest way is to add an extra loop with the "page" variable:
# just check the number of the last page on the website
page = 1
while page != 10:
    print(f'Scraping page: {page}')
    my_url = f'https://www.xxxxxx.com/food-processors/pr?sid=j9e%2Cm38%2Crj3&page={page}'
    # here add the for loop you already have
    page += 1
This method should work.
I got this code off a forum and I have installed all the dependencies, but I get a few errors.
I don't know Python very well, so I decided to look it up, but I was not able to fix it that way.
# all cats are yellow
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Quora login information
email = ""
passy = ""

# File with questions here
filey = "fileys.txt"

# Read file, strip newlines, return question list
def readFile(filey):
    with open(filey, "r") as f:
        q = f.readlines()
        qlist = [x.strip() for x in q]
    # qlist = reversed(qlist)  # Will reverse the question list if needed
    print len(qlist), "Total Questions Loaded"
    return qlist
# Log in to Quora
def login(email, passy):
    print "Logging in..."
    driver.get("http://quora.com")
    # Create soup object and find all form_column classes
    forms = BeautifulSoup(driver.page_source, "lxml").find_all(class_="form_column")
    # Iterate through forms
    # Find the polymorphic id string and append a hashtag (#) to create a CSS selector
    for form in forms:
        try:
            # This is for the email/password entry box
            data = form.find("input")["name"]
            if data == "email":
                email_css = "#" + form.find("input")["id"]
            if data == "password":
                password_css = "#" + form.find("input")["id"]
        except:
            pass
        try:
            # This is for the Login button
            data = form.find("input")["value"]
            if data == "Login":
                button_css = "#" + form.find("input")["id"]
        except:
            pass
    driver.find_element_by_css_selector(email_css).send_keys(email)
    driver.find_element_by_css_selector(password_css).send_keys(passy)
    time.sleep(2)
    driver.find_element_by_css_selector(button_css).click()
    time.sleep(2)
    # LOGIN FINISHED
# Create question list
qlist = readFile(filey)

# Create webdriver, vroom vroom
driver = webdriver.Chrome()

# Total questions posted counter
county = 0

# Iterate through qlist and ask questions until there are no more
for question in qlist:
    try:
        print question
        driver.get("http://quora.com")
        soup = BeautifulSoup(driver.page_source, "lxml")
        # Find all text areas
        blox = soup.find_all("textarea")
        # Find the polymorphic id string for the Ask Question entry field
        for x in blox:
            try:
                placeholder = x["placeholder"]
                if placeholder.__contains__("Ask or Search Quora"):  # Fix this later
                    askbar_css = "#" + x["id"]
                    print askbar_css
            except:
                pass
        askbutton = "#" + soup.find(class_="AskQuestionButton")["id"]  # Fix this later
        # Type out the question
        driver.find_element_by_css_selector(askbar_css).send_keys(question)
        # Wait for the ask button to become clickable
        time.sleep(.2)  # Fix later
        try:
            driver.find_element_by_css_selector(askbutton).click()
        except:
            # Click failed  # Fix later
            pass
        # Find the popup
        while True:
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="Modal AskQuestionModal")
                break
            except:
                pass
        soup = BeautifulSoup(driver.page_source, "lxml")
        popup = "#" + soup.find(class_="submit_button modal_action")["id"]
        driver.find_element_by_css_selector(popup).click()
        for x in range(0, 17):
            time.sleep(.1)
            try:
                soup = BeautifulSoup(driver.page_source, "lxml")
                popExists = soup.find(class_="PMsgContainer")  # Found popup
                if str(popExists).__contains__("You asked"):  # big no no
                    county += 1
                    break
            except:
                pass
        print "county=>", county
    except Exception, e:
        print e
        print "ERROR"
        pass
So the code opens Chrome and loads Quora; however, it gets stuck on logging in and the script ends. I am also on a Mac. I get the following error:
'NoneType' object has no attribute '__getitem__'
ERROR
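(For what it's worth, that error in Python 2 means the code tried to subscript None, i.e. one of the soup.find(...) calls found nothing, most likely because Quora's markup has changed since this forum code was written. A small guard, sketched against the askbutton lookup above, would at least make the failing selector obvious:)

ask_button_tag = soup.find(class_="AskQuestionButton")
if ask_button_tag is None:
    raise RuntimeError("AskQuestionButton not found; Quora's markup may have changed")
askbutton = "#" + ask_button_tag["id"]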
I am new to web scraping so please forgive my ignorance.
I built a program to scrape Zillow, and everything has worked fine for the most part. My problem is that I am using a proxy service called proxycrawl that lets me easily integrate proxies into my program. This is done by placing https://api.proxycrawl.com/?token=xxx&url= before my actual URL. What I have noticed is that when the program clicks on an "a" tag, the URL changes as in the example below:
Before: (screenshot: "Before Click", showing the proxied URL)
After: (screenshot: "After Click", showing the URL switched to the proxycrawl site)
After about 11 clicks, whether through the program or done manually, the site changes to the proxycrawl site, where I get the 404 error. Any ideas?
# Browser open
print(".....Opening Browser.....")
Browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
Browser.maximize_window()

# Browser page
url = urllib.parse.quote_plus('https://www.zillow.com/homes/for_sale/Bakersfield-CA-93312/house,mobile,land,townhouse_type/97227_rid/35.4606,-119.037467,35.317856,-119.200888_rect/12_zm/0_mmm/')
Browser.get('https://api.proxycrawl.com/?token=xxx&url=' + url)
print("Opening Zillow")
time.sleep(10)

last_page = int(Browser.find_element_by_xpath("""//ol[@class="zsg-pagination"]//li[last()-1]""").text)
# print(last_page)
page = 0
count = 0

csv_file = open('listings.csv', 'w')
fieldnames = ['address', 'price', 'zestimate', 'beds', 'baths', 'feet', 'desc', 'Type', 'year_built', 'heating', 'cooling', 'parking', 'lot',
              'days_on_market', 'pricepsqr', 'saves', 'interior', 'spaces_amenities', 'construction', 'exterior', 'parking1', 'mls', 'other']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()

for i in range(last_page):
    page = page + 1
    n = 0
    listings = Browser.find_elements_by_xpath("""//*[@id="search-results"]/ul/li""")
    for i in range(len(listings)):
        n = i + 1
        listing_dict = {}
        print("Scraping the listing number {0} on page {1}, the count is {2}".format(n, page, count))
        if (count) % 11 == 0:
            listings = Browser.find_elements_by_xpath('//*[@id="search-results"]/ul/li')
            time.sleep(2)
        try:
            # Finds listings
            listings = Browser.find_elements_by_xpath("""//*[@id="search-results"]/ul/li""")
            print("Looking up listings")
            # Opens listing
            listings[i].find_elements_by_tag_name('a')[0].click()
            print("Opening Listing")
            time.sleep(2)
            # Opens the "See More" tab
            Browser.find_element_by_partial_link_text('See More').click()
            # Prepare for scrape
            time.sleep(2)
I did speak with proxycrawl, and they stated that the URL had to be encoded, which I did, with no luck. After encoding it, I replied and got the following statement:
"You are sending your requests double encoded and you get a response of pc_status: 602. Those requests are failing and you should fix them. Please only encode the URLs once; encoding the URLs more than once will result in a failing request."
It looks like the page is trying to redirect you with a relative URL.
In this specific use case, you could hack your way around the encoding issue by doing something similar to the following:
# driver.current_url is now e.g. https://api.proxycrawl.com/homes/for_sale/Test/one,two
x = driver.current_url

# keep only the relative part, /homes/for_sale/Test/one,two
# ("https://api.proxycrawl.com" is exactly 26 characters long)
r = x[26:]

# base_url = 'https://api.proxycrawl.com/?token=xxx&url='
u = base_url + r
driver.get(u)
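The same idea without the hard-coded 26-character slice, using urlparse to pull out the relative part (a sketch; base_url is assumed to be the proxycrawl prefix from the question):

from urllib.parse import urlparse

base_url = 'https://api.proxycrawl.com/?token=xxx&url='

parsed = urlparse(driver.current_url)       # e.g. https://api.proxycrawl.com/homes/for_sale/Test/one,two
relative = parsed.path                      # /homes/for_sale/Test/one,two
if parsed.query:
    relative += '?' + parsed.query
driver.get(base_url + relative)             # route the relative path back through the proxy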
I am learning Beautiful Soup for Python and trying to parse the website https://www.twitteraudit.com/. When I enter a Twitter ID in the search bar, it returns the results for some IDs in a fraction of a second, but some IDs take about a minute to process. In those cases, how can I parse the HTML after it has finished loading and the result is ready? I tried looping over it, but that doesn't work. What I did figure out is that if I open a browser, load the link, and wait until it finishes, the result gets cached, and the next time I run the script for the same ID it works perfectly.
Can anyone help me out with this? I appreciate the help. I attach the code below:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import re
from re import sub


def HTML(myURL):
    uClient = uReq(myURL)
    pageHTML = uClient.read()
    uClient.close()
    pageSoup = soup(pageHTML, "html.parser")
    return pageSoup


def fakecheck(usr):
    myURLfc = "https://www.twitteraudit.com/" + usr
    pgSoup = HTML(myURLfc)

    foll = pgSoup.findAll("div", {"class": "audit"})

    link = foll[0].div.a["href"]
    real = foll[0].findAll("span", {"class": "real number"})[0]["data-value"]
    fake = foll[0].findAll("span", {"class": "fake number"})[0]["data-value"]
    scr = foll[0].findAll("div", {"class": "score"})[0].div
    scoresent = scr["class"][1]
    score = re.findall(r'\d{1,3}', str(scr))[0]
    return [link, real, fake, scoresent, score]


lis = ["BarackObama", "POTUS44", "ObamaWhiteHouse", "MichelleObama", "ObamaFoundation", "NSC44", "ObamaNews", "WhiteHouseCEQ44", "IsThatBarrak", "obama_barrak", "theprezident", "barrakubama", "BarrakObama", "banackkobama", "YusssufferObama", "barrakisdabomb_", "BarrakObmma", "fuzzyjellymasta", "BarrakObama6", "bannalover101", "therealbarrak", "ObamaBarrak666", "barrak_obama"]

for u in lis:
    link, real, fake, scoresent, score = fakecheck(u)

    print("link : " + link)
    print("Real : " + real)
    print("Fake : " + fake)
    print("Result : " + scoresent)
    print("Score : " + score)
    print("=================")
I think the problem is that some of the Twitter IDs have not yet been audited, so I was getting an IndexError. Putting the call to fakecheck(u) in a while True: loop that catches that error will keep checking the website until an audit has been performed for that ID.
I put this code after the lis definition:
def get_fake_check(n):
    return fakecheck(n)

for u in lis:
    while True:
        try:
            link, real, fake, scoresent, score = get_fake_check(u)
            break
        except IndexError:
            # the audit for this ID is not ready yet; keep checking
            pass
I'm not sure if there is a way to automate the audit request on the website, but when a query was waiting, I manually clicked the "Audit" button on the website for that ID, and once the audit was completed, the script continued as usual until all of the IDs had been processed.
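One small refinement to the retry loop above, so the site is not polled constantly while an audit is pending (a sketch; the 10-second delay is arbitrary):

import time

for u in lis:
    while True:
        try:
            link, real, fake, scoresent, score = fakecheck(u)
            break
        except IndexError:
            time.sleep(10)  # wait a bit before re-checking whether the audit has finished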