Web scraping twitter - python

I want to scrape a Twitter search page to download tweets for a specific search word. I am only able to fetch the first 20 tweets rather than all of them. How can I fetch all the tweets for the query? Below is the code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

company_name = 'ABC'
url = 'https://twitter.com/search?q=%23%27%20%20%20' + company_name + '&src=typd&lang=en'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = requests.get(url, headers=headers)
data = req.text
soup = BeautifulSoup(data, "html.parser")
tweets = [p.text for p in soup.findAll('p', class_='tweet-text')]
df = pd.DataFrame()
df['Tweet'] = tweets
print(df.head())
print(df.shape)
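The reason only 20 tweets come back: the search page embeds just the first batch of results in the initial HTML, and the rest are injected by JavaScript as you scroll, so requests and BeautifulSoup alone never see them. One common workaround is to drive a real browser and keep scrolling until the page stops growing. Below is a minimal sketch with Selenium, assuming the page still serves the p.tweet-text markup targeted above (Twitter changes its markup and limits scraping aggressively, so treat this purely as an illustration of the scrolling technique):

from selenium import webdriver
from bs4 import BeautifulSoup
import time

company_name = 'ABC'
url = 'https://twitter.com/search?q=%23' + company_name + '&src=typd&lang=en'

driver = webdriver.Chrome()  # needs chromedriver on PATH
driver.get(url)

last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    # scroll to the bottom so the page's JavaScript loads the next batch
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the new tweets time to arrive
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # nothing new loaded, stop
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
tweets = [p.text for p in soup.find_all('p', class_='tweet-text')]  # assumed selector
print(len(tweets))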

Related

Getting an empty list when trying to extract URLs from Google with BeautifulSoup

I am trying to extract the first 100 URLs returned from a location search in Google; however, I am getting an empty list every time ("no results found").
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = []
    if results:
        counter = 0
        for result in results:
            websites.append(result.find("a")["href"])
            counter += 1
            if counter == 100:
                break
    else:
        print("No search results found.")
    return websites

location = "Athens"
print(get_location_info(location))
Output
No search results found.
[]
I have also tried this approach:
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites

location = "sifnos"
print(get_location_info(location))
and I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
First of all, always take a look at your soup to see if all the expected ingredients are in place; the div.r class you are selecting is most likely not present in the HTML that Google serves to requests. Select your elements more specifically, in this case for example with a CSS selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send a cookie:
cookies={'CONSENT':'YES+'}
Example
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers, cookies={'CONSENT':'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    websites = [a.get('href') for a in soup.select('a:has(>h3)')]
    return websites

location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']
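The question asked for the first 100 URLs, and a plain search page only yields around ten. Google has historically honoured a num query parameter for requesting more results per page; that parameter is undocumented and may be capped or ignored, so the following is a minimal sketch of the idea only:

import requests
from bs4 import BeautifulSoup

def get_location_info(location, wanted=100):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    # num is an undocumented parameter (an assumption here); Google may ignore it
    url = "https://www.google.com/search?q=" + query + "&num=" + str(wanted)
    response = requests.get(url, headers=headers, cookies={'CONSENT': 'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a.get('href') for a in soup.select('a:has(>h3)')][:wanted]

print(len(get_location_info("Athens")))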

How do I scrape all movie titles, dates and reviews from the website below? https://www.nollywoodreinvented.com/list-of-all-reviews

I have tried the code below, but it only brings back the first page and does not fully load the reviews for the movies. I am interested in getting all the movie titles, movie dates, and reviews.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.nollywoodreinvented.com/list-of-all-reviews'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'lxml')

movie_div = soup.find_all('div', class_='article-panel')

title = []
for div in movie_div:
    images = div.find_all('div', class_='article-image-wrapper')
    for image in images:
        image = image.find_all('div', class_='article-image')
        for img in image:
            title.append(img.a.img['title'])

date = []
for div in movie_div:
    date.append(div.find('div', class_='authorship type-date').text.strip())

info = []
for div in movie_div:
    info.append(div.find('div', class_='excerpt-text').text.strip())

movie = pd.DataFrame({'title': title, 'date': date, 'info': info}, index=None)
movie.head()
There is a backend API which serves up the HTML you are scraping. You can see it in action if you open your browser's Developer Tools, go to the Network tab, filter to Fetch/XHR, and click on the 2nd or 3rd page link. We can recreate that POST request with Python like the below:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

pages = 3
results_per_page = 500  # max 500 I think
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

url = 'https://www.nollywoodreinvented.com/wp-admin/admin-ajax.php'

output = []
for page in range(1, pages+1):
    payload = {
        'action': 'itajax-sort',
        'view': 'grid',
        'loop': 'main loop',
        'location': '',
        'thumbnail': '1',
        'rating': '1',
        'meta': '1',
        'award': '1',
        'badge': '1',
        'authorship': '1',
        'icon': '1',
        'excerpt': '1',
        'sorter': 'recent',
        'columns': '4',
        'layout': 'full',
        'numarticles': str(results_per_page),
        'largefirst': '',
        'paginated': str(page),
        # NB: a Python dict cannot hold the same key twice, so only the last
        # 'currentquery[category__in][]' value below is actually sent
        # (see the note after this block).
        'currentquery[category__in][]': '2648',
        'currentquery[category__in][]': '2649'
    }
    resp = requests.post(url, headers=headers, data=payload).json()
    print(f'Scraping page: {page} - results: {results_per_page}')
    soup = BeautifulSoup(resp['content'], 'html.parser')

    for film in soup.find_all('div', class_='article-panel'):
        try:
            title = film.find('h3').text.strip()
        except AttributeError:
            continue
        date = datetime.strptime(film.find('span', class_='date').text.strip(), "%B %d, %Y").strftime('%Y-%m-%d')
        likes = film.find('span', class_='numcount').text.strip()
        if not likes:
            likes = 0
        full_stars = [1 for _ in film.find_all('span', class_='theme-icon-star-full')]
        half_stars = [0.5 for _ in film.find_all('span', class_='theme-icon-star-half')]
        stars = (sum(full_stars) + sum(half_stars)) / 2.0
        item = {
            'title': title,
            'date': date,
            'likes': likes,
            'stars': stars
        }
        output.append(item)

df = pd.DataFrame(output)
df.to_csv('nollywood_data.csv', index=False)
print('Saved to nollywood_data.csv')
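One caveat about the payload above, flagged in the comment: a Python dict literal cannot carry 'currentquery[category__in][]' twice, so only '2649' is actually posted. requests also accepts form data as a list of 2-tuples, which does preserve repeated field names. A minimal sketch of the idea, with the payload trimmed to a few fields (the remaining fields from the block above would be added as pairs the same way, and whether the endpoint needs both categories is an assumption):

import requests

url = 'https://www.nollywoodreinvented.com/wp-admin/admin-ajax.php'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

# a list of (name, value) pairs keeps both category ids, which a dict cannot
pairs = [
    ('action', 'itajax-sort'),
    ('view', 'grid'),
    ('numarticles', '500'),
    ('paginated', '1'),
    ('currentquery[category__in][]', '2648'),
    ('currentquery[category__in][]', '2649'),
]
resp = requests.post(url, headers=headers, data=pairs)
print(resp.status_code)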

Fix for missing 'tr' class in webscraping

I'm trying to web scrape different stocks by row, with the data scraped from https://www.slickcharts.com/sp500. I am following a tutorial that uses a similar website; however, that website uses classes for each of its rows, while mine doesn't.
This is the code I'm trying to use, but I don't get any output whatsoever. I'm still pretty new at coding, so any feedback is welcome.
import requests
import pandas as pd
from bs4 import BeautifulSoup

company = []
symbol = []

url = 'https://www.slickcharts.com/sp500'  # Data from SlickCharts
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
rows = soup.find_all('tr')
for i in rows:
    row = i.find_all('td')
    print(row[0])
First of all, you need to add some headers to your request, because most likely you are getting the same thing as me: status code 403 Forbidden. The website is blocking your request; adding a User-Agent does the trick:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
page = requests.get(url, headers=headers)
Then you can iterate over the tr tags as you do. But you should be careful: the first tr, for example, doesn't contain any td tags, and you will get an exception on the line:
print(row[0])
Here is example code that prints the names of all the companies:
import requests
from bs4 import BeautifulSoup

company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500'  # Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
rows = soup.find_all('tr')
for row in rows:
    all_td_tags = row.find_all('td')
    if len(all_td_tags) > 0:
        print(all_td_tags[1].text)
But this code also outputs some other data besides company names, because you are iterating over all the tr tags on the page. You need to iterate over a specific table only (the first table on the page, in this case):
import requests
from bs4 import BeautifulSoup

company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500'  # Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
first_table_on_the_page = soup.find('table')
rows = first_table_on_the_page.find_all('tr')
for row in rows:
    all_td_tags = row.find_all('td')
    if len(all_td_tags) > 0:
        print(all_td_tags[1].text)
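Since the target is an ordinary HTML table and the question already imports pandas, pandas.read_html can shortcut the row iteration entirely. A minimal sketch: read_html parses every table in the markup into a DataFrame, and the constituents sit in the first one (the column names are whatever the page's header row contains):

import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500'
page = requests.get(url, headers=headers)

# read_html returns one DataFrame per <table>; the first holds the S&P 500 list
df = pd.read_html(page.text)[0]
print(df.head())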

How to get all 'href' with soup in Python? I tried so many times but it doesn't work

How do I get all the 'href' values with soup in Python? I have tried so many times, but in vain.
Whether I use the 'soup.find' or the 'soup.find_all' method to go after the 'href', it doesn't work.
Python version: 3.10
!pip install requests
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1, 2):
    url = "https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    for link in Soup.find_all('ul', class_="searchItem Stype"):
        print(len(link))
        Link = link.li.a
        LINK = Link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
Sorry, I totally misunderstood your problem. find_all is not a very versatile tool, and you were searching for the wrong ul. I barely changed your code, but it seems to work now:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1, 2):
    url = f"https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    for link in Soup.select('ul#surveyContent > li > a[href]:first-of-type'):
        print(len(link))
        # ~ Link = link.li.a
        LINK = link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
The same data can also be collected through the mobile endpoint, grabbing each product's goods_code and then reading the discounted price out of the product page's scripts:
import re
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
goodlink = []
for page in range(1, 2):
    url = "https://m.momomall.com.tw/m/store/DCategory.jsp?entp_code=103487&category_code=all&orderby=3&page={}".format(page)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for goods_code in soup.select('a.nofollowBtn_star'):
        Goods_code = 'https://www.momomall.com.tw/s/103487/' + goods_code.get('goods_code') + '/'
        goodlink.append(Goods_code)

discount_regex = re.compile(r'discountPrice = (\d{1,5})')
for URL in goodlink:
    R = requests.get(URL, headers=headers)
    Soup = BeautifulSoup(R.text, "lxml")
    for dataprice in Soup.select('script'):
        # re.search needs a string, not a bs4 Tag, and the pattern may be absent
        match = discount_regex.search(str(dataprice))
        if match:
            print(match.group(1))

Trouble using pandas read_html(): ValueError

from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd

url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)
with urlopen(url) as doc:
    html = BeautifulSoup(res.text, 'lxml')
    pgrr = html.find('td', class_='pgRR')
    s = str(pgrr.a['href']).split('=')
    last_page = s[-1]

df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
for page in range(1, int(last_page)+1):
    page_url = '{}&page={}'.format(sise_url, page)
    df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0])

df = df.dropna()  # drop rows with missing values
print(df)
I'm getting this ValueError while crawling the daily stock data on Naver Finance. I have no trouble getting the URL, but when I use read_html() I get a ValueError: Table not found from the line df = df.append(pd.read_html(page_url, encoding='euc-kr', header='0')[0]). Please give some advice.
I don't read Korean... however, pd.read_html() was getting an error page, since fetching the URL directly sends no browser headers. Resolve this by fetching the page with requests.get() and headers, then passing res.text to read_html():
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://finance.naver.com/item/sise_day.nhn?code=068270&page=1"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
res = requests.get(url, verify=True, headers=headers)

# read the last page number out of the "go to last page" (pgRR) link
html = BeautifulSoup(res.text, 'lxml')
pgrr = html.find('td', class_='pgRR')
s = str(pgrr.a['href']).split('=')
last_page = s[-1]

df = pd.DataFrame()
sise_url = 'http://finance.naver.com/item/sise_day.nhn?code=068270'
for page in range(1, int(last_page)+1):
    page_url = '{}&page={}'.format(sise_url, page)
    # fetch with headers, then hand the HTML text (not the URL) to read_html
    res = requests.get(page_url, verify=True, headers=headers)
    df = df.append(pd.read_html(res.text, encoding='euc-kr')[0])
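A caveat for anyone running this on a recent pandas: DataFrame.append was deprecated in 1.4 and removed in 2.0. A minimal sketch of the same loop for current versions, reusing last_page, sise_url and headers from the block above and collecting the page tables for a single concat:

frames = []
for page in range(1, int(last_page)+1):
    page_url = '{}&page={}'.format(sise_url, page)
    res = requests.get(page_url, verify=True, headers=headers)
    # collect each page's table; concatenating once is also faster than appending
    frames.append(pd.read_html(res.text, encoding='euc-kr')[0])

df = pd.concat(frames, ignore_index=True).dropna()
print(df)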
