I am using PyCharm to capture some data from the web and push it into a database table in SQLite. I have debugged the code and it works fine: in the debugger I can see the data being fetched and pushed into the db[table] location.
The Python code is as below -
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def begin():
    db = dataset.connect('sqlite:///quotes.db')
    authors_seen = set()
    base_url = 'http://quotes.toscrape.com/'

    def clean_url(url):
        # Clean '/author/Steve-Martin' to 'Steve-Martin'
        # Use urljoin to make an absolute URL
        url = urljoin(base_url, url)
        # Use urlparse to get out the path part
        path = urlparse(url).path
        # Now split the path by '/' and get the second part
        # E.g. '/author/Steve-Martin' -> ['', 'author', 'Steve-Martin']
        return path.split('/')[2]

    def scrape_quotes(html_soup):
        for quote in html_soup.select('div.quote'):
            quote_text = quote.find(class_='text').get_text(strip=True)
            quote_author_url = clean_url(quote.find(class_='author').find_next_sibling('a').get('href'))
            quote_tag_urls = [clean_url(a.get('href')) for a in quote.find_all('a', class_='tag')]
            authors_seen.add(quote_author_url)
            # Store this quote and its tags
            quote_id = db['quotes'].insert({'text': quote_text, 'author': quote_author_url})
            db['quotes_tags'].insert_many([{'quote_id': quote_id, 'tag_id': tag} for tag in quote_tag_urls])

    def scrape_author(html_soup, author_id):
        author_name = html_soup.find(class_='author-title').get_text(strip=True)
        author_born_date = html_soup.find(class_='author-born-date').get_text(strip=True)
        author_born_loc = html_soup.find(class_='author-born-location').get_text(strip=True)
        author_desc = html_soup.find(class_='author-description').get_text(strip=True)
        db['authors'].insert({'author_id': author_id, 'name': author_name,
                              'born_date': author_born_date, 'born_location': author_born_loc,
                              'description': author_desc})

    # Start by scraping all the quote pages
    print('*****Beginning scraping process - quotes first.*****')
    url = base_url
    while True:
        print('Now scraping page:', url)
        r = requests.get(url)
        html_soup = BeautifulSoup(r.text, 'html.parser')
        # Scrape the quotes
        scrape_quotes(html_soup)
        # Is there a next page?
        next_a = html_soup.select('li.next > a')
        if not next_a or not next_a[0].get('href'):
            break
        url = urljoin(url, next_a[0].get('href'))

    # Now fetch out the author information
    print('*****Scraping authors data.*****')
    for author_id in authors_seen:
        url = urljoin(base_url, '/author/' + author_id)
        print('Now scraping author:', url)
        r = requests.get(url)
        html_soup = BeautifulSoup(r.text, 'html.parser')
        # Scrape the author information
        scrape_author(html_soup, author_id)

    db.commit()
    db.close()
What I am struggling with is the PyCharm IDE connection. As shown in the figure below, I can see the quotes.sqlite database. It has only one table listed - sqlite_master. Under server objects there are collations, modules and routines, which are part of the infrastructure provided by SQLite.
Also, when I view the db object (Python's dataset connection to SQLite) in the debugger, I can see the relevant tables, as shown in the picture below -
Any ideas why PyCharm refuses to show the relevant table/collection in the IDE?
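As a sanity check independent of the IDE, here is a minimal sketch (standard-library sqlite3 only) that lists the tables actually present in the quotes.db file the script writes to; if the tables appear here, the data is on disk and the issue lies with the PyCharm data source view rather than the scraping code.
import sqlite3

# Print every table stored in the quotes.db file created by begin()
con = sqlite3.connect('quotes.db')
for (name,) in con.execute("SELECT name FROM sqlite_master WHERE type='table'"):
    print(name)
con.close()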
Related
I've been trying to scrape Bandcamp fan pages to get a list of the albums they have purchased, and I'm having trouble doing it efficiently. I wrote something with Selenium but it's fairly slow, so I'd like to learn a solution that would perhaps send a POST request to the site and parse the JSON from there.
Here's a sample collection page: https://bandcamp.com/nhoward
Here's the Selenium code:
def scrapeFanCollection(url):
    browser = getBrowser()
    setattr(threadLocal, 'browser', browser)
    # Go to url
    browser.get(url)
    try:
        # Click show more button
        browser.find_element_by_class_name('show-more').click()
        # Wait two seconds
        time.sleep(2)
        # Scroll to the bottom, loading the full collection
        scroll(browser, 2)
    except Exception:
        pass
    # Return full album collection
    soup_a = BeautifulSoup(browser.page_source, 'lxml', parse_only=SoupStrainer('a', {"class": "item-link"}))
    # Empty array
    urls = []
    # Loop through all the a elements in the page source
    for item in soup_a.find_all('a', {"class": "item-link"}):
        url = item.get('href')
        if url is not None:
            urls.append(url)
    return urls
The API can be accessed as follows:
$ curl -X POST -H "Content-Type: Application/JSON" -d \
'{"fan_id":82985,"older_than_token":"1586531374:1498564527:a::","count":10000}' \
https://bandcamp.com/api/fancollection/1/collection_items
I didn't encounter a scenario where an "older_than_token" was stale, so the problem boils down to getting the "fan_id" given a URL.
This information is located in a data blob in the id="pagedata" element.
>>> import json
>>> import requests
>>> from bs4 import BeautifulSoup
>>> res = requests.get("https://www.bandcamp.com/ggorlen")
>>> soup = BeautifulSoup(res.text, "lxml")
>>> user = json.loads(soup.find(id="pagedata")["data-blob"])
>>> user["fan_data"]["fan_id"]
82985
Putting it all together (building upon this answer):
import json
import requests
from bs4 import BeautifulSoup
fan_page_url = "https://www.bandcamp.com/ggorlen"
collection_items_url = "https://bandcamp.com/api/fancollection/1/collection_items"
res = requests.get(fan_page_url)
soup = BeautifulSoup(res.text, "lxml")
user = json.loads(soup.find(id="pagedata")["data-blob"])
data = {
    "fan_id": user["fan_data"]["fan_id"],
    "older_than_token": user["wishlist_data"]["last_token"],
    "count": 10000,
}
res = requests.post(collection_items_url, json=data)
collection = res.json()
for item in collection["items"][:10]:
    print(item["album_title"], item["item_url"])
I'm using user["wishlist_data"]["last_token"], which has the same format as the "older_than_token", just in case this matters.
In order to get the entire collection, I changed the previous code from
"older_than_token": user["wishlist_data"]["last_token"]
to
"older_than_token": user["collection_data"]["last_token"]
which contained the right token.
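For reference, a minimal sketch of that change applied to the snippet above; only the token source differs, everything else is reused from the earlier code:
data = {
    "fan_id": user["fan_data"]["fan_id"],
    "older_than_token": user["collection_data"]["last_token"],
    "count": 10000,
}
res = requests.post(collection_items_url, json=data)
collection = res.json()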
Unfortunately for you, this particular Bandcamp site doesn't seem to make any HTTP API call to fetch the list of albums. You can check that by using your browser's developer tools: open the Network tab and click on the XHR filter. The only call being made seems to be the one fetching your collection details.
I'm currently doing web scraping for the first time, trying to grab and compile a list of completed katas from my Codewars profile. You can view the completed problems without being logged in, but it does not display your solutions unless you are logged in to that specific account.
Here is an inspect preview of the page display when logged in and the relevant divs I'm trying to scrape:
The url for that page is https://www.codewars.com/users/User_Name/completed_solutions
with User_Name replaced by an actual username.
The log-in page is: https://www.codewars.com/users/sign_in
I have attempted to get the divs with the class "list-item solutions" in two different ways, which I'll show below:
#attempt 1
import requests
from bs4 import BeautifulSoup

login_url = "https://www.codewars.com/users/sign_in"
end_url = "https://www.codewars.com/users/Ash-Ozen/completed_solutions"

with requests.session() as sesh:
    result = sesh.get(login_url)
    soup = BeautifulSoup(result.content, "html.parser")
    token = soup.find("input", {"name": "authenticity_token"})["value"]
    payload = {
        "user[email]": "ph#gmail.com",
        "user[password]": "phpass>",
        "authenticity_token": str(token),
    }
    result = sesh.post(login_url, data=payload)  # this logs me in?
    page = sesh.get(end_url)  # This navigates me to the target page?
    soup = BeautifulSoup(page.content, "html.parser")
    print(soup.prettify())  # some debugging
    # Examining the print statement shows that "list-item solutions" is not
    # there. Checking page.url shows the correct url
    # (https://www.codewars.com/users/Ash-Ozen/completed_solutions).
    solutions = soup.findAll("div", class_="list-item solutions")
    # solutions yields an empty list.
and
#attempt 2
from robobrowser import RoboBrowser
from bs4 import BeautifulSoup

browser = RoboBrowser(history=True)
browser.open("https://www.codewars.com/users/sign_in")
form = browser.get_form()
form["user[email]"].value = "phmail#gmail.com"
form["user[password]"].value = "phpass"
browser.submit_form(form)  # I think robobrowser handles the csrf token for me?
browser.open("https://www.codewars.com/users/Ash-Ozen/completed_solutions")
r = browser.parsed()
soup = BeautifulSoup(str(r[0]), "html.parser")
solutions = soup.find_all("div", class_="list-item solutions")
print(solutions)  # returns an empty list
No idea how or what to debug from here to get it working.
Edit: My initial thought about what is going wrong is that, after performing either post, I get redirected to the dashboard (the behaviour after logging in successfully), but when I then request the final url I seem to end up with the non-logged-in version of the page.
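One hedged way to narrow this down (a sketch against the first attempt, not a confirmed fix): inspect the response of the login POST itself before requesting the target page, right after the sesh.post(...) line. The marker searched for ("Ash-Ozen") is only an assumption about something that should appear when logged in.
result = sesh.post(login_url, data=payload)
print(result.status_code, result.url)  # landing back on /users/sign_in would suggest the login failed
print("Ash-Ozen" in result.text)       # False here would suggest the session is not authenticated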
I am trying to scrape some data from stockrow.com using BeautifulSoup.
However, there seem to be some differences between Inspect and View Source (I'm using Chrome, but I don't see that being a problem for Python).
This is causing some trouble, as the source code itself does not show HTML tags such as h1. They do, however, show up when I use the Inspect tool.
The part I am trying to scrape (among other things), as shown by the Inspect tool:
<h1>Teva Pharmaceutical Industries Ltd<small>(TEVA)</small></h1>
My current code, printing an empty list:
import bs4 as bs
import urllib.request


class Stock:
    stockrow_url = "https://stockrow.com"
    url_suffix = "/financials/{}/annual"

    def __init__(self, ticker: str, stock_url=stockrow_url, url_suffix=url_suffix):
        # Stock ticker
        self.ticker = ticker.upper()
        # URLs for financial statements related to the ticker
        self.stock_url = stock_url + "/{}".format(self.ticker)
        sauce = urllib.request.urlopen(self.stock_url).read()
        soup = bs.BeautifulSoup(sauce, 'html.parser').h1
        print(soup)
        self.income_url = self.stock_url + url_suffix.format("income")
        self.balance_sheet_url = self.stock_url + url_suffix.format("balance")
        self.cash_flow_url = self.stock_url + url_suffix.format("cashflow")


teva = Stock("teva")
print(teva.get_income_statement())
The page is dynamically generated using JavaScript and cannot be handled by BeautifulSoup. You can capture the information either with Selenium and the like, or by looking for the underlying API calls.
In this case, you can get background information for TEVA using:
import json
import requests
hdr = {'User-Agent':'Mozilla/5.0'}
url = "https://stockrow.com/api/companies/TEVA.json?ticker=TEVA"
response = requests.get(url, headers=hdr)
info = json.loads(response.text)
info
Similarly, the income statement is hiding here:
url = 'https://stockrow.com/api/companies/TEVA/financials.json?ticker=TEVA&dimension=MRY&section=Income+Statement'
Using the same code as above, but with this other url, will get you your income statement in JSON format.
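For example, a sketch reusing the hdr headers defined above with the income-statement url:
url = 'https://stockrow.com/api/companies/TEVA/financials.json?ticker=TEVA&dimension=MRY&section=Income+Statement'
response = requests.get(url, headers=hdr)
income_statement = json.loads(response.text)
print(income_statement)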
And you can take it from there. Search around - there is a lot of information available on this topic. Good luck.
I'm trying to scrape data from the http://portal.uspto.gov/EmployeeSearch/ web site.
I open the site in a browser, click on the Search button inside the "Search by Organisation" part of the site, and look at the request being sent to the server.
When I post the same request using the Python requests library in my program, I don't get the result page I'm expecting; I get the same search page, with no employee data on it.
I've tried all variants, nothing seems to work.
My question is: what URL should I use in my request, do I need to specify headers (I tried that too, copying the headers viewed in the Firefox developer tools for the request), or something else?
Below is the code that sends the request:
import requests
from bs4 import BeautifulSoup


def scrape_employees():
    URL = 'http://portal.uspto.gov/EmployeeSearch/searchEm.do;jsessionid=98BC24BA630AA0AEB87F8109E2F95638.prod_portaljboss4_jvm1?action=displayResultPageByOrgShortNm&currentPage=1'
    response = requests.post(URL)
    site_data = response.content
    soup = BeautifulSoup(site_data, "html.parser")
    print(soup.prettify())


if __name__ == '__main__':
    scrape_employees()
All the data you need is in a form tag:
action is the url you make the POST to.
The input elements are the data you need to post to the server, as {name: value} pairs.
import requests, bs4, urllib.parse, re


def make_soup(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup


def get_form(soup):
    form = soup.find(name='form', action=re.compile(r'OrgShortNm'))
    return form


def get_action(form, base_url):
    action = form['action']
    # action is a relative url, convert it to an absolute url
    abs_action = urllib.parse.urljoin(base_url, action)
    return abs_action


def get_form_data(form, org_code):
    data = {}
    for inp in form('input'):
        # if the value is empty, put the org_code in this field
        data[inp['name']] = inp['value'] or org_code
    return data


if __name__ == '__main__':
    url = 'http://portal.uspto.gov/EmployeeSearch/'
    soup = make_soup(url)
    form = get_form(soup)
    action = get_action(form, url)
    data = get_form_data(form, '1634')
    # make the request to the action url using data
    r = requests.post(action, data=data)
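From there you would parse r.text for the employee data. The markup of the result page isn't shown in the question, so the following sketch just dumps the parsed result for inspection (continuing from the r returned by requests.post above); adapt the selectors once you see the actual markup:
result_soup = bs4.BeautifulSoup(r.text, 'lxml')
print(result_soup.prettify())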
I'm writing a web scraper. I could've just used scrapy, but I decided to write it from scratch so I could practice.
I've created a scraper that works successfully using requests and BeautifulSoup. It navigates through about 135 pages with 12 items on each, grabs the links and then grabs the information from each link destination. At the end it writes everything to a CSV file. It only grabs strings, and it doesn't download any images or anything like that… for now.
Problem? It's quite slow. It takes about 5 seconds to grab everything from one page, and that times 135 is about 11 minutes.
So my question is: how do I implement threading in my code so it gets the data much faster?
Here's the code:
import requests
from bs4 import BeautifulSoup
import re
import csv


def get_actor_dict_from_html(url, html):
    soup = BeautifulSoup(html, "html.parser")
    # There must be a better way to handle this, but let's assign a NULL value to all upcoming variables.
    profileName = profileImage = profileHeight = profileWeight = 'NULL'
    # Let's get the name and image..
    profileName = str.strip(soup.find('h1').get_text())
    profileImage = "http://images.host.com/actors/" + re.findall(r'\d+', url)[0] + "/actor-large.jpg"
    # Now the rest of the stuff..
    try:
        profileHeight = soup.find('a', {"title": "Height"}).get_text()
    except:
        pass
    try:
        profileWeight = soup.find('a', {"title": "Weight"}).get_text()
    except:
        pass
    return {
        'Name': profileName,
        'ImageUrl': profileImage,
        'Height': profileHeight,
        'Weight': profileWeight,
    }


def lotta_downloads():
    output = open("/tmp/export.csv", 'w', newline='')
    wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
    wr.writeheader()
    for i in range(135):
        url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("div", {"class": "card-image"})
        for a in links:
            for url in a.find_all('a'):
                url = "http://www.host.com" + url['href']
                print(url)
                response = requests.get(url)
                html = response.content
                actor_dict = get_actor_dict_from_html(url, html)
                wr.writerow(actor_dict)
    print('All Done!')


if __name__ == "__main__":
    lotta_downloads()
Thanks!
Why don't you try to use the gevent library?
The gevent library provides a monkey patch that turns blocking functions into non-blocking ones.
Maybe the wait time of the requests is too long and that is what makes it slow.
So I think that making the requests non-blocking will make your program fast.
On Python 2.7.10
example:
import gevent
from gevent import monkey; monkey.patch_all()  # patch blocking calls (e.g. sockets) to be non-blocking
import csv
import bs4
import requests

actor_dict_list = []


def worker(url):
    content = requests.get(url).content
    soup = bs4.BeautifulSoup(content, "html.parser")
    links = soup.find_all('div', {'class': 'card-image'})
    for a in links:
        for link in a.find_all('a'):
            actor_url = "http://www.host.com" + link['href']
            response = requests.get(actor_url)  # You can also use the gevent spawn function on this line
            html = response.content
            # Append instead of writing the CSV row here, to prevent a race condition
            actor_dict_list.append(get_actor_dict_from_html(actor_url, html))


output = open("/tmp/export.csv", "w", newline='')
wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
wr.writeheader()
urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]
jobs = [gevent.spawn(worker, url) for url in urls]
gevent.joinall(jobs)
for actor_dict in actor_dict_list:
    wr.writerow(actor_dict)
The public gevent documentation has more details.
P.S.
If you are on Ubuntu, you must install python-gevent:
sudo apt-get install python-gevent
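As a side note, on Python 3 (or Python 2.7 with the futures backport) the standard-library concurrent.futures module offers a thread-based alternative to gevent. The sketch below is only an illustration: it parallelises fetching the 135 listing pages and reuses the host.com URLs and the get_actor_dict_from_html helper from the question.
from concurrent.futures import ThreadPoolExecutor
import requests

urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]


def fetch(url):
    # Each call runs in a worker thread; the thread blocks on network I/O, not the CPU
    return url, requests.get(url).content


with ThreadPoolExecutor(max_workers=10) as pool:
    pages = list(pool.map(fetch, urls))

# 'pages' is now a list of (url, html) pairs that the existing parsing and CSV code can consume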