How to print all results of Beautiful Soup at once? - python

I have a list of Twitter usernames and I need to get their number of followers. I used BeautifulSoup and requests, but I only get the result for one account each time.
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv', engine='python')
followers = purcsv['username']
followers.head(10)

handle = purcsv['username'][0:40]
temp = ("https://twitter.com/" + handle)
temp = temp.tolist()

for url in temp:
    page = requests.get(url)

bs = BeautifulSoup(page.text, 'lxml')
follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
print("Number of followers: {} ".format(followers.get('data-count')))

That's because you are looping over the URLs first and fetching the content of each one into the same variable page here:
for url in temp:
    page = requests.get(url)
so after the loop page only contains the last URL fetched. To solve this you need to process each page as soon as it is fetched:
followers_list = []
for url in temp:
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))

print(followers_list)
Here is a full example to verify:
from bs4 import BeautifulSoup
import requests
import pandas as pd

purcsv = pd.read_csv('pureeng.csv')
followers = purcsv['username']
handles = purcsv['username'][0:40].tolist()

followers_list = []
for handle in handles:
    url = "https://twitter.com/" + handle
    try:
        page = requests.get(url)
    except Exception as e:
        print(f"Failed to fetch page for url {url} due to: {e}")
        continue
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    followers = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
    print("Number of followers: {} ".format(followers.get('data-count')))
    followers_list.append(followers.get('data-count'))

print(followers_list)
output:
Number of followers: 13714085
Number of followers: 4706511
['13714085', '4706511']
You may consider using async functions for fetching and processing those URLs if you have too many of them.
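For example, here is a minimal sketch of that idea using a thread pool instead of full asyncio; it assumes the handles list and the page structure from the example above (Twitter's current HTML may no longer expose these classes):
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch_followers(handle):
    # Fetch one profile page and pull the follower count out of the ProfileNav box.
    url = "https://twitter.com/" + handle
    try:
        page = requests.get(url, timeout=10)
        bs = BeautifulSoup(page.text, "html.parser")
        follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
        span = follow_box.find('a').find('span', {'class': 'ProfileNav-value'})
        return span.get('data-count')
    except Exception as e:
        print(f"Failed to fetch followers for {handle}: {e}")
        return None

# Fetch up to 8 profiles at a time instead of one after another.
with ThreadPoolExecutor(max_workers=8) as pool:
    followers_list = list(pool.map(fetch_followers, handles))

print(followers_list)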

Related

Python: I want to build up an email extractor but only got one email address

I am a beginner and want to collect email addresses of government employees. I ran the following code and got only one email address; ideally the output should contain about 30,000 email addresses. Additionally, how do I save the output to an Excel file?
The URL I passed to the code below:
https://www.google.com/search?q=county+assessor%27s+office
Code:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib.parse
from collections import deque
import re

user_url = str(input('[+] Enter Target URL To Scan:'))
urls = deque([user_url])
scraped_urls = set()
emails = set()
count = 0

try:
    while len(urls):
        count += 1
        if count == 100000:
            break
        url = urls.popleft()
        scraped_urls.add(url)
        parts = urllib.parse.urlsplit(url)
        base_url = '{0.scheme}://{0.netloc}'.format(parts)
        path = url[:url.rfind('/')+1] if '/' in parts.path else url
        print('[%d] Processing %s' % (count, url))
        try:
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            continue
        new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        emails.update(new_emails)
        soup = BeautifulSoup(response.text, features="lxml")
        for anchor in soup.find_all("a"):
            link = anchor.attrs['href'] if 'herf' in anchor.attrs else ''
            if link.startswith('/'):
                link = base_url + link
            elif not link.startswith('http'):
                link = path + link
            if not link in urls and not link in scraped_urls:
                urls.append(link)
except KeyboardInterrupt:
    print('[-] Closing!')

for mail in emails:
    print(mail)
Thanks in advance for any help you can provide!
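Regarding the last part of the question, here is a minimal sketch of writing the collected addresses to an Excel file; it assumes pandas (and openpyxl for .xlsx output) are installed and that emails is the set built by the script above:
import pandas as pd

# Turn the collected set of addresses into a one-column table and write it out.
df = pd.DataFrame(sorted(emails), columns=["email"])
df.to_excel("emails.xlsx", index=False)
# or, without the Excel dependency:
# df.to_csv("emails.csv", index=False)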

How to scrape author name and author URL from a webpage using Python

I am trying to scrape the author name and author URL from the following webpage:
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive
and I am using the following code:
author_flag = 0
divs = soup.find_all('h2')
for div in divs:
    author = div.find('a')
    if(author is not None):
        author_art.append(author.text)
        author_url.append('https://medium.com' + author.get('href'))
        aurhor_flag = 1
        break
if(author_flag == 0):
    author_art.append('Author information missing')
    author_url.append('Author Url information missing')
Can anyone take a look at what I am doing wrong here? This code is not picking up anything; it just returns a blank list.
Full code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

data = pd.read_csv('url_technology.csv')
author_art = []
author_url = []

for i in range(1):
    try:
        author_flag = 0
        divs = soup.find_all('meta')
        for div in divs:
            author = div.find('span')
            if(author is not None):
                author_art.append(author.text)
                author_url.append('https://medium.com' + author.get('href'))
                aurhor_flag = 1
                break
        if(author_flag == 0):
            author_art.append('Author information missing')
            author_url.append('Author Url information missing')
    except:
        print('no data found')

author_art = pd.DataFrame(title)
author_url = pd.DataFrame(url)
res = pd.concat([author_art, author_art], axis=1)
res.columns = ['Author_Art', 'Author_url']
res.to_csv('combined1.csv')
print('File created successfully')
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive---------0-----------------------
https://medium.com/job-advice-for-software-engineers/what-i-want-and-dont-want-to-see-on-your-software-engineering-resume-cbc07913f7f6?source=tag_archive---------1-----------------------
https://itnext.io/load-testing-using-apache-jmeter-af189dd6f805?source=tag_archive---------2-----------------------
https://medium.com/s/story/black-mirror-bandersnatch-a-study-guide-c46dfe9156d?source=tag_archive---------3-----------------------
https://medium.com/fast-company/the-worst-design-crimes-of-2018-56f32b027bb7?source=tag_archive---------4-----------------------
https://towardsdatascience.com/make-your-pictures-beautiful-with-a-touch-of-machine-learning-magic-31672daa3032?source=tag_archive---------5-----------------------
https://medium.com/hackernoon/the-state-of-ruby-2019-is-it-dying-509160a4fb92?source=tag_archive---------6-----------------------
One possibility to get the author name and author URL is to parse the LD+JSON data embedded within the page:
import json
import requests
from bs4 import BeautifulSoup
url = "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(soup.select_one('[type="application/ld+json"]').contents[0])
# uncomment this to print all LD+JSON data:
# print(json.dumps(data, indent=4))
print("Author:", data["author"]["name"])
print("URL:", data["author"]["url"])
Prints:
Author: Eric Elliott
URL: https://medium.com/@_ericelliott
EDIT: A function that returns Author Name/URL:
import json
import requests
from bs4 import BeautifulSoup

def get_author_name_url(medium_url):
    soup = BeautifulSoup(requests.get(medium_url).content, "html.parser")
    data = json.loads(
        soup.select_one('[type="application/ld+json"]').contents[0]
    )
    return data["author"]["name"], data["author"]["url"]

url_list = [
    "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20",
]

for url in url_list:
    name, url = get_author_name_url(url)
    print("Author:", name)
    print("URL:", url)
I've launched a Python package called medium-apis to do such tasks.
Install medium-apis:
pip install medium-apis
Get your RapidAPI key (see the package docs for how).
Run the code:
from medium_apis import Medium

medium = Medium('YOUR_RAPIDAPI_KEY')

def get_author(url):
    url_without_parameters = url.split('?')[0]
    article_id = url_without_parameters.split('-')[-1]
    article = medium.article(article_id=article_id)
    author = article.author
    author.save_info()
    return author

urls = [
    "https://nishu-jain.medium.com/medium-apis-documentation-3384e2d08667",
]

for url in urls:
    author = get_author(url)
    print('Author: ', author.fullname)
    print('Profile URL: ', f'https://medium.com/@{author.username}')
Github repo: https://github.com/weeping-angel/medium-apis

Scraping results displayed across multiple pages from a `.aspx` site

I'm trying to scrape this site:
http://www.occeweb.com/MOEAsearch/index.aspx
If I manually search for A, I see the results spread across multiple pages but when I try to fetch the results using my script below, I get the results from the first page repeatedly:
I've tried with:
import requests
from bs4 import BeautifulSoup

url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['txtSearch'] = 'A'
    payload['__EVENTTARGET'] = 'gvResults'
    payload['__EVENTARGUMENT'] = f'Page${page}'
    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
How can I get the results from other pages as well?
Your problem happens at the line below:
payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
When you request the second page, the payload still contains the btnSearch field, which causes the server to treat the request as a new search operation instead of a next-page operation.
The fix is quite simple; below is the updated code:
import requests
from bs4 import BeautifulSoup

url = 'http://www.occeweb.com/MOEAsearch/index.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.text, 'lxml')

for page in range(1, 3):
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['txtSearch'] = 'A'
    payload['__EVENTTARGET'] = 'gvResults'
    payload['__EVENTARGUMENT'] = f'Page${page}'
    if page > 1:
        payload.pop('btnSearch')
    res = session.post(url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    for items in soup.select("#gvResults tr")[1:2]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)

Google news crawler flip pages

Continuing on previous work to crawl all news results about a query and return their titles and URLs, I am refining the crawler to get all results from all pages in Google News. The current code seems to only return the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
My code is below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd

query2Google = input("What do you want from Google News?\n")

def QGN(query2Google):
    s = '"' + query2Google + '"'  # Keywords for query
    s = s.replace(" ", "+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'  # csv filename
    f = open(filename, "wb")
    url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y"  # URL for query of news results within one year, sorted by date
    #htmlpage = urllib2.urlopen(url).read()
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: " + str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text, 'lxml')
    df = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        #print("-----Title----\n" + str(a_click.renderContents()))  # Title
        #print("----URL----\n" + str(a_click.get("href")))  # URL
        #print("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))  # Brief
        #print("Done")
        df = np.append(df, [str(a_click.renderContents()).strip("b'"), str(a_click.get("href")).strip('/url?q='), str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])
    df = np.reshape(df, (-1, 3))
    df1 = pd.DataFrame(df, columns=['Title', 'URL', 'Brief'])
    print("Search Crawl Done!")
    df1.to_csv(filename, index=False, encoding='utf-8')
    f.close()
    return

QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still, you can modify your script with a for loop if you want to get a fixed number of pages, or a while loop if you want to get all pages.
Example:
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
for next in range(0, pages*10, 10) :
page = url + str(next)
time.sleep(randint(1, 5)) # you may need longer than that #
htmlpage = requests.get(page) # you should add User-Agent and Referer #
print("Status code: " + str(htmlpage.status_code))
if htmlpage.status_code != 200 :
break # something went wrong #
soup = BeautifulSoup(htmlpage.text, 'lxml')
... process response here ...
next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
if next_page is None or next_page.a is None :
break # there are no more pages #
Keep in mind that Google doesn't like bots; you might get banned.
You could add 'User-Agent' and 'Referer' headers to simulate a web browser, and use time.sleep(random.uniform(2, 6)) to simulate a human... or use selenium.
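A minimal sketch of that suggestion (the header strings below are placeholders, not values Google requires):
import random
import time

import requests

# Browser-like headers; any realistic User-Agent/Referer strings will do.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "https://www.google.com/",
}

page = "http://www.google.com.sg/search?q=%22python%22&tbm=nws&tbs=qdr:y&start=0"
time.sleep(random.uniform(2, 6))  # pause between requests to look less like a bot
htmlpage = requests.get(page, headers=headers)
print("Status code: " + str(htmlpage.status_code))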
You can also add &num=25 to the end of your query to get back a page with that many results; in this example you would get 25 Google results back.

Making requests to multiple links using urllib.request

I'm trying to make requests to multiple links from Yahoo Finance and then return arrays of the Income Statement, Balance Sheet and Cash Flow, respectively. I found myself stuck with a lot of for loops. I'm wondering if there is a better version of the code below:
def scrapper(symbol):
    htmls = []
    soup = []
    gen_table = []
    IS = "http://finance.yahoo.com/q/is?s={}+Income+Statement&annual".format(symbol)
    BS = "http://finance.yahoo.com/q/is?s={}+Balance+Sheet&annual".format(symbol)
    CF = "http://finance.yahoo.com/q/is?s={}+Cash+Flow&annual".format(symbol)
    urls = [IS, BS, CF]

    # read each link in urls
    for url in urls:
        with urllib.request.urlopen(url) as response:
            htmls.append(response.read())

    # parse data with BeautifulSoup
    for html in htmls:
        soup.append(BeautifulSoup(html))

    # store income statement, balance sheet and cash flow into soup
    for s in soup:
        gen_table.append(s.find_all("table", class_="yfnc_tabledata1"))

    return gen_table
I might do it this way:
from bs4 import BeautifulSoup
import urllib.request

def fetch_table(symbol, table):
    url = "http://finance.yahoo.com/q/is?s={}+{}&annual".format(symbol, table)
    with urllib.request.urlopen(url) as response:
        result = response.read()
    result = BeautifulSoup(result)
    result = result.find_all("table", class_="yfnc_tabledata1")
    return result

def scrapper(symbol):
    return [fetch_table(symbol, table)
            for table in (
                "Income+Statement",
                "Balance+Sheet",
                "Cash+Flow")]

print(scrapper("X"))
