I have been scraping Twitter with Python using the Selenium driver. Most of the code works: when I scrape only a single tweet it runs fine, but when I repeat the process for more than one tweet it stops working. It scrolls through the new tweets but does not scrape or save anything. I based my code on this notebook: https://github.com/israel-dryer/Twitter-Scraper/blob/main/twitter-scraper-tut.ipynb
Please help me find the mistake in my code. Here is my code:
import time
import csv
from getpass import getpass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.twitter.com/login')
driver.maximize_window()
time.sleep(5)
username = driver.find_element(by=By.XPATH, value='//input[@name="text"]')
username.send_keys('ABC')
username.send_keys(Keys.RETURN)
time.sleep(5)
password = driver.find_element(by=By.XPATH, value='//input[@name="password"]')
password.send_keys('XYZ')
password.send_keys(Keys.RETURN)
time.sleep(5)
search_input = driver.find_element(by=By.XPATH, value='//input[@aria-label="Search query"]')
search_input.send_keys('#NEET')
search_input.send_keys(Keys.RETURN)
time.sleep(5)
driver.find_element(by=By.LINK_TEXT,value='Latest').click()
time.sleep(5)
def get_tweet_data(card):
    # Extract the fields of a single tweet card
    find_username = card.find_element(by=By.XPATH, value='.//span').text
    try:
        find_twitter = card.find_element(by=By.XPATH, value='.//span[contains(text(), "@")]').text
    except NoSuchElementException:
        return
    try:
        find_date = card.find_element(by=By.XPATH, value='.//time').get_attribute('datetime')
    except NoSuchElementException:
        return
    find_tweets1 = card.find_element(by=By.XPATH, value='.//div[2]/div[2]/div[1]').text
    find_tweets2 = card.find_element(by=By.XPATH, value='.//div[2]/div[2]/div[2]').text
    final_tweet = find_tweets1 + find_tweets2
    find_retweet = card.find_element(by=By.XPATH, value='.//div[@data-testid="retweet"]').text
    find_reply = card.find_element(by=By.XPATH, value='.//div[@data-testid="reply"]').text
    find_likes = card.find_element(by=By.XPATH, value='.//div[@data-testid="like"]').text
    tweet = (find_username, find_twitter, find_date, final_tweet, find_retweet, find_reply, find_likes)
    return tweet
data = []
tweet_ids = set()
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
while scrolling:
    page_cards = driver.find_elements(by=By.XPATH, value='//div[@data-testid="tweet"]')
    for card in page_cards[-15:]:
        tweet = get_tweet_data(card)
        if tweet:
            tweet_id = ''.join(tweet)
            if tweet_id not in tweet_ids:
                tweet_ids.add(tweet_id)
                data.append(tweet)
    scroll_attempt = 0
    while True:
        # check scroll position
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(5)
        curr_position = driver.execute_script("return window.pageYOffset;")
        if last_position == curr_position:
            scroll_attempt += 1
            # end of scroll region
            if scroll_attempt >= 3:
                scrolling = False
                break
            else:
                time.sleep(5)  # attempt another scroll
        else:
            last_position = curr_position
            break
# close the web driver
driver.close()
with open('scrappeddata1.csv', 'w', newline='', encoding='utf-8') as f:
    header = ['find_username', 'find_twitter', 'find_date', 'final_tweet', 'find_retweet', 'find_reply', 'find_likes']
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)
Please help me solve this issue.
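One quick way to narrow this down is to log how many cards each pass actually finds and whether get_tweet_data returns anything; if the count is 0, the card XPath no longer matches the page. A minimal diagnostic sketch, run once after the search results load (the article selector is an assumption to verify in DevTools, since Twitter changes its markup frequently):
# Diagnostic sketch: check whether any tweet cards are found and parsed.
# The selector below is an assumption; confirm the actual markup in DevTools.
page_cards = driver.find_elements(by=By.XPATH, value='//article[@data-testid="tweet"]')
print('cards found on this pass:', len(page_cards))
for card in page_cards[-15:]:
    parsed = get_tweet_data(card)
    print('parsed OK:', parsed is not None)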
I was doing some crawling stuff with selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
instagram_id="username"
instagram_pw="password"
_id = driver.find_element(By.NAME, 'username')
_id.send_keys(instagram_id)
time.sleep(2)
_password = driver.find_element(By.NAME, 'password')
_password.send_keys(instagram_pw)
time.sleep(2)
login_button = driver.find_element(By.CSS_SELECTOR, '.sqdOP.L3NKy.y3zKF').click()
time.sleep(5) #press login button
_keyword = '교토'
driver.get('https://www.instagram.com/explore/tags/' + _keyword + '/')  # Instagram search
driver.find_element(By.CSS_SELECTOR, 'div.v1Nh3.kIKUG._bz0w').click()
time.sleep(5) #open first post
There was no problem up to this point.
But here, a NoSuchElementException occurs:
results = []
count = 200
for i in range(count):
    data = driver.find_elements(By.CSS_SELECTOR, 'a.xil3i')  # save hashtag info
    for j in range(len(data)):
        results.append(data[j].text.replace("#", ""))  # remove '#'
    if (i+1) % 10 == 0:
        print('post {} done'.format(i+1))
    driver.find_element(By.CSS_SELECTOR, 'a._65Bje.coreSpriteRightPaginationArrow').click()  # move to the next post
    time.sleep(5)
Please help me fix this error.
Thanks
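A NoSuchElementException on the pagination arrow usually means the element has not rendered yet when the click runs, so an explicit wait tends to be more reliable than a fixed sleep. A minimal sketch reusing the same CSS selector as above (the class names are Instagram's internal ones and may change at any time):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the "next post" arrow before clicking it.
next_arrow = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, 'a._65Bje.coreSpriteRightPaginationArrow'))
)
next_arrow.click()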
I have run into a problem: I cannot give each worker in multiprocessing its own line from a file. All the workers end up getting the same lines at the same time.
In other words, I want the first worker to take the first line of the file, the second worker to take the second line, and so on.
from multiprocessing import Pool
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import pyperclip
import time
from selenium.webdriver.common.by import By
import multiprocessing
password = "12345"
with open("Token.txt") as f:
content = f.readlines()
content = [x.strip() for x in content]
def get_data(url):
    try:
        service = Service(r"D:\pythonProject\Test\chromedriver\chromedriver.exe")
        chrome_options = Options()
        chrome_options.add_extension("metamask.crx")
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(url=url)
        time.sleep(5)
        driver.switch_to.window(driver.window_handles[1])
        driver.get("chrome-extension://nkbihfbeogaeaoehlefnkodbefgpgknn/home.html")
        time.sleep(2)
        driver.switch_to.window(driver.window_handles[1])
        driver.get("chrome-extension://nkbihfbeogaeaoehlefnkodbefgpgknn/home.html")
        time.sleep(2)
        start_metamask_button = driver.find_element(
            By.XPATH, '//*[@id="app-content"]/div/div[2]/div/div/div/button').click()
        time.sleep(3)
        import_wallet = driver.find_element(
            By.XPATH, '//*[@id="app-content"]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/button').click()
        time.sleep(3)
        agreed = driver.find_element(
            By.XPATH, '//*[@id="app-content"]/div/div[2]/div/div/div/div[5]/div[1]/footer/button[2]').click()
        time.sleep(3)
        select_word_phrase = Select(driver.find_element(
            By.XPATH, '//*[@id="app-content"]/div/div[2]/div/div/div[2]/form/div[1]/div[2]/select'))
        select_word_phrase.select_by_visible_text('I have a 24-word phrase')
        time.sleep(3)
        search_elem = driver.find_element(By.XPATH, '//*[@id="import-srp__srp-word-0"]').click()
        time.sleep(2)
        pyperclip.copy(content)
        print(content)
    except Exception as ex:
        print(ex)
    finally:
        driver.quit()
if __name__ == '__main__':
    process_count = 3
    url = 'chrome-extension://nkbihfbeogaeaoehlefnkodbefgpgknn/home.html'
    urls_list = [url] * process_count
    print(urls_list)
    p = Pool(processes=process_count)
    p.map(get_data, urls_list)
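One way to give each worker its own line is to pair every URL with one line from the file and let the pool unpack the pairs, for example with starmap. A minimal sketch under that assumption (get_data_for_token is a hypothetical variant of get_data; Token.txt must contain at least process_count lines, and the Selenium/Metamask steps stay the same as above):
def get_data_for_token(url, token):
    # `token` is this worker's own line from Token.txt; run the same
    # Selenium/Metamask steps as in get_data above, but use
    # pyperclip.copy(token) instead of copying the whole `content` list.
    print(url, token)

if __name__ == '__main__':
    process_count = 3
    url = 'chrome-extension://nkbihfbeogaeaoehlefnkodbefgpgknn/home.html'
    # one (url, line) pair per worker
    pairs = list(zip([url] * process_count, content[:process_count]))
    with Pool(processes=process_count) as p:
        p.starmap(get_data_for_token, pairs)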
I am trying to scrape all the video links and their corresponding titles from a Facebook page. Then I want to map each video ID to its title, so that I can automatically download the videos and re-upload them to YouTube with the correct title. For some reason my code never finishes, and the video IDs and titles never get written to a JSON file called videoTitles.json. It just keeps running and running.
I also tried to stop the for loops with:
if index+1 == len(videoLinks):
    break
Here is the code:
from asyncio.windows_events import NULL
from msilib.schema import Error
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from youtube_dl.utils import UnsupportedError, DownloadError
import os
import json
import youtube_dl
import ffmpeg
from datetime import datetime
import re
import sys
import argparse
import requests
import time
import os.path
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")
# disable the banner "Chrome is being controlled by automated test software"
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
url=input("Facebook-Page: ")
# global driver
driver = webdriver.Chrome( options=chrome_options)
driver.get(url)
button=WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Alle Cookies gestatten']"))).click()
driver.implicitly_wait(10)
links=[]
links_videos_dl=[]
titles=[]
SCROLL_PAUSE_TIME = 1
videoLinks=set()
videoTitles=set()
info=dict()
videoTitleMapping=dict()
while True:
    last_height = driver.execute_script("return document.body.scrollHeight")
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH,"//span/div/a")]
titles=[elem.get_attribute("textContent") for elem in driver.find_elements(By.XPATH,"//span/div/a/span/span")]
for index, link in enumerate(links):
    if link and index < 16:
        if url in link:
            videoLinks.add(link)
for index, title in enumerate(titles):
    if index < 16:
        videoTitles.add(title)
print("NUMBER OF LINKS", len(videoLinks))
print("NUMBER OF TITLES", len(videoTitles))
for index, title in enumerate(videoTitles):
    if index+1 == len(videoTitles):
        break
    for index, videoLink in enumerate(videoLinks):
        input_video = None
        input_audio = None
        if index+1 == len(videoLinks):
            break
        print('videoLInksLoop')
        ydl_opts = {"format": "bestvideo/best", "outtmpl": "%(id)s.%(ext)s", "ignoreerrors": True}
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([videoLink])
                info = ydl.extract_info(videoLink, download=False, process=True)
                print('INFO!!!', info['id'])
            videoTitleMapping[info['id']] = title
            jsonVideoTitleMapping = json.dumps(videoTitleMapping, indent=4)
            if os.path.isfile(f"{info['id']}.mp4"):
                input_video = ffmpeg.input(f"{info['id']}.mp4")
            else:
                input_video = ffmpeg.input(f"{info['id']}.webm")
            ydl_opts = {"format": "bestaudio/best", "outtmpl": "%(id)s.%(ext)s", "ignoreerrors": True}
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([videoLink])
                print(videoLink)
                info = ydl.extract_info(videoLink, download=False, process=True)
            if os.path.isfile(f"{info['id']}.m4a"):
                input_audio = ffmpeg.input(f"{info['id']}.m4a")
            if os.path.isfile(f"./processed/final_{info['id']}.mp4"):
                print("File already downloaded")
            else:
                if input_audio == None:
                    if os.path.isfile(f"{info['id']}.webm"):
                        shutil.move(f"{info['id']}.webm", f"./processed/final_{info['id']}.webm")
                    else:
                        shutil.move(f"{info['id']}.mp4", f"./processed/final_{info['id']}.mp4")
                else:
                    print('prossing')
                    if input_video and input_audio:
                        ffmpeg.concat(input_video, input_audio, v=1, a=1).output(f"./processed/final_{info['id']}.mp4").run()
        except:
            print('failed')
        finally:
            print('Done')
with open("videoTitles.json", "w") as outfile:
    outfile.write(jsonVideoTitleMapping)
It keeps printing out:
if os.path.isfile(f"./processed/final_{info['id']}.mp4"):
    print("File already downloaded")
and:
finally:
    print('Done')
What am I doing wrong here? Thanks a lot :)
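For what it's worth, one thing that can make the run look endless is that the videoLink loop is nested inside the title loop, so every link is downloaded once per title, and videoTitles.json is only written after both loops finish. A minimal sketch of iterating links and titles together instead (it assumes the links and titles line up one-to-one, which only holds if they are kept as ordered lists rather than sets):
# Pair each link with its title and write the mapping once at the end.
# NOTE: zip() over sets gives no guaranteed order; keep videoLinks/videoTitles
# as lists in page order for this pairing to be meaningful.
ydl_opts = {"format": "bestvideo/best", "outtmpl": "%(id)s.%(ext)s", "ignoreerrors": True}
for videoLink, title in zip(videoLinks, videoTitles):
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(videoLink, download=True)  # download once per link
    if info:  # ignoreerrors can make this None
        videoTitleMapping[info['id']] = title

with open("videoTitles.json", "w") as outfile:
    json.dump(videoTitleMapping, outfile, indent=4)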
I am trying to automate login to the website https://research.axiscapital.co.in/.
I am able to enter the username and password. I have also automated solving the captcha. But after it solves the captcha, I am unable to click the login button; I get a WebDriverException: target frame detached. I am adding the code below (without the real username and password) for assistance.
NOTE: As soon as the captcha verification expires, the login button becomes clickable again. Kindly help me with it.
import requests
import time
import os
# Added for Selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
# TO MAKE THE SCRAPING FASTER
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("https://research.axiscapital.co.in/")
filename = '1.mp3'
delayTime = 2
googleIBMLink = 'https://speech-to-text-demo.ng.bluemix.net/'
audioToTextDelay = 10
def audioToText(mp3Path):
    print("1")
    driver.execute_script('''window.open("","_blank");''')
    driver.switch_to.window(driver.window_handles[1])
    print("2")
    driver.get(googleIBMLink)
    delayTime = 10
    # Upload file
    time.sleep(1)
    print("3")
    # Upload file
    time.sleep(1)
    root = driver.find_element_by_id('root').find_elements_by_class_name('dropzone _container _container_large')
    btn = driver.find_element(By.XPATH, '//*[@id="root"]/div/input')
    btn.send_keys('D:\\blogs\\1.mp3')
    # Audio to text is processing
    time.sleep(delayTime)
    # btn.send_keys(path)
    print("4")
    # Audio to text is processing
    time.sleep(audioToTextDelay)
    print("5")
    text = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[7]/div/div/div').find_elements_by_tag_name('span')
    print("5.1")
    result = " ".join([each.text for each in text])
    print("6")
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    print("7")
    return result
def saveFile(content, filename):
    with open(filename, "wb") as handle:
        for data in content.iter_content():
            handle.write(data)
wait = WebDriverWait(driver,60)
wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@id="Username"]'))).send_keys(username)
wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@name="Password"]'))).send_keys(password)
time.sleep(1)
googleClass = driver.find_elements_by_class_name('g-recaptcha')[0]
time.sleep(2)
outeriframe = googleClass.find_element_by_tag_name('iframe')
time.sleep(1)
outeriframe.click()
time.sleep(2)
allIframesLen = driver.find_elements_by_tag_name('iframe')
time.sleep(1)
audioBtnFound = False
audioBtnIndex = -1
for index in range(len(allIframesLen)):
    driver.switch_to.default_content()
    iframe = driver.find_elements_by_tag_name('iframe')[index]
    driver.switch_to.frame(iframe)
    driver.implicitly_wait(delayTime)
    try:
        audioBtn = driver.find_element_by_id('recaptcha-audio-button') or driver.find_element_by_id('recaptcha-anchor')
        audioBtn.click()
        audioBtnFound = True
        audioBtnIndex = index
        break
    except Exception as e:
        pass
if audioBtnFound:
    try:
        while True:
            href = driver.find_element_by_id('audio-source').get_attribute('src')
            response = requests.get(href, stream=True)
            saveFile(response, filename)
            response = audioToText(os.getcwd() + '/' + filename)
            print(response)
            driver.switch_to.default_content()
            iframe = driver.find_elements_by_tag_name('iframe')[audioBtnIndex]
            driver.switch_to.frame(iframe)
            inputbtn = driver.find_element_by_id('audio-response')
            inputbtn.send_keys(response)
            inputbtn.send_keys(Keys.ENTER)
            time.sleep(2)
            errorMsg = driver.find_elements_by_class_name('rc-audiochallenge-error-message')[0]
            if errorMsg.text == "" or errorMsg.value_of_css_property('display') == 'none':
                print("Success")
                break
    except Exception as e:
        print(e)
        print('Caught. Need to change proxy now')
else:
    print('Button not found. This should not happen.')
time.sleep(4)
wait.until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Login"]'))).click()
You forgot to switch back to the default content after successfully completing the captcha.
Put driver.switch_to.default_content() before the break.
Edit: the success block would look like this.
print("Success")
driver.switch_to.default_content()
break
I am using Python and Selenium to scrape all the reviews of a particular hotel on TripAdvisor, and I am new to scraping. Currently it only scrapes reviews from the first 6 pages out of 36. I need to scrape the reviews from all the pages for that hotel and save them to a CSV file. The following is the code I'm using.
import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
driver = webdriver.Chrome("./chromedriver")
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
time.sleep(2)
def getHotelReviews():
    # Find and click the More link (to load all reviews)
    driver.find_element_by_xpath("//span[@class='_33O9dg0j']").click()
    time.sleep(20)
    reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
    reviews_count = len(reviews)
    print(reviews_count)
    # Loop through the reviews found
    for i in range(2, reviews_count):
        try:
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"):
                moreBtn = reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]").click()
                time.sleep(20)
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"):
                review = reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span").text
                print(review)
            date = reviews[i].find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
            print(date)
            title = reviews[i].find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
            print(title)
            # Save to CSV
            csvWriter.writerow((date, title, review))
        except:
            break
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
def getHotelPages(url):
    driver.get(url)
    # to maximize the driver
    driver.maximize_window()
    nextPage = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    noOfPages = len(nextPage)
    print(noOfPages)
    for i in range(noOfPages):
        print(nextPage[i].get_attribute("href"))
        URLs.append(nextPage[i].get_attribute("href"))
URLs = [
'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']
# Prepare CSV file
csvFile = open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8")
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['Date', 'Title', 'Review'])
try:
    getHotelPages(URLs[0])
except:
    print("Error!!")
time.sleep(60)
for url in URLs:
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(url)
    getHotelReviews()
    time.sleep(20)
csvFile.close()
driver.close()
Can you help me by suggesting a method or working code to scrape the reviews from all the pages of a hotel?
A simple way to click through pages 1-36:
size = int(driver.find_element_by_css_selector('div.pageNumbers > a:nth-last-child(1)').text)
for i in range(2, size + 1):  # pages 2 through the last page; page 1 is already displayed
    pageNums = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    pageNums.find_element_by_xpath("//a[text()='{}']".format(i)).click()
    time.sleep(5)
Import
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
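To tie this back to the question, the paging loop can call the existing getHotelReviews() on each page. A sketch, assuming the driver is already on the first reviews page and that the driver.close()/window switch at the end of getHotelReviews is moved out of the function so the tab stays open between pages:
# Scrape page 1, then click through the remaining pages and scrape each one.
getHotelReviews()  # page 1 is already displayed
size = int(driver.find_element_by_css_selector('div.pageNumbers > a:nth-last-child(1)').text)
for i in range(2, size + 1):
    pageNums = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    pageNums.find_element_by_xpath("//a[text()='{}']".format(i)).click()
    time.sleep(5)
    getHotelReviews()  # scrape the reviews shown on this page
csvFile.close()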