I am using three modules in this program, and I don't know if what I'm trying to do is even possible! I want to scrape some data off of Twitter and write it to a text file using Python. Can somebody please guide me and tell me why my code isn't writing the scraped data?
import urllib.request
from os import path
from bs4 import BeautifulSoup

# here I define the url, I request the page, create my soup
theurl = "https://twitter.com/realDonaldTrump"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

def create_file(dest):
    """
    Creates a file for the user to write data in!
    :param dest:
    :return:
    """
    ## FileName == Month_Day_Year
    name = 'Data Scraped.txt'
    if not path.isfile(dest + name):
        f = open(dest + name, "w")
        f.write(soup.title.text)
        f.close()

if __name__ == '__main__':
    destination = 'C:\\Users\\edwin\\' \
                  'Desktop\\WebScrappin\\'
    create_file(destination)
    print("Your file has been created!!")
You're only writing the title of the document that you received:
f.write(soup.title.text)
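If you want all of the visible text on the page rather than just the title, a minimal change (a sketch; get_text() is BeautifulSoup 4's standard helper for this) would be:

f.write(soup.get_text())  # writes every piece of visible text, not just the <title>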
Instead of scraping (which is against their ToS), you should gather your data from their RESTful API or use a library like Twython.
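For example, a minimal Twython sketch (assuming you've registered a Twitter app to obtain the four OAuth credentials; the screen name is just the one from the question):

from twython import Twython

# Hypothetical credentials: register an app with Twitter to obtain these.
APP_KEY = "..."
APP_SECRET = "..."
OAUTH_TOKEN = "..."
OAUTH_TOKEN_SECRET = "..."

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# Pull recent tweets through the official API instead of scraping the page.
tweets = twitter.get_user_timeline(screen_name="realDonaldTrump", count=20)
for tweet in tweets:
    print(tweet["text"])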
I'm working on a web scraping project in Python and trying to add automated testing with pytest. I'm not new to web scraping, but I'm very new to testing. I believe the idea here is that I should mock the HTTP request and replace it with some dummy HTML fixture code, to test whether the rest of the function works without having to rely on requesting anything from the actual URL.
Below is my web scraping function.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

def get_player_stats_data():
    """
    Web Scrape function w/ BS4 that grabs aggregate season stats

    Args:
        None

    Returns:
        Pandas DataFrame of Player Aggregate Season stats
    """
    try:
        year_stats = 2022
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")

        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]

        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]

        stats = pd.DataFrame(player_stats, columns=headers)
        print(
            f"General Stats Extraction Function Successful, retrieving {len(stats)} updated rows"
        )
        return stats
    except BaseException as error:
        print(f"General Stats Extraction Function Failed, {error}")
        df = []
        return df
And here is what I'm using to grab the raw HTML of the page and save it, so I can import it for testing.
import pickle
from bs4 import BeautifulSoup
from urllib.request import urlopen

year_stats = 2022
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
html = urlopen(url)

# how you save it
with open('new_test/tests/fixture_csvs/stats_html.html', 'wb') as fp:
    while True:
        chunk = html.read(1024)
        if not chunk:
            break
        fp.write(chunk)

# how you open it
with open('new_test/tests/fixture_csvs/stats_html.html', "rb") as fp:
    stats_html = fp.read()
My question is: how do I mock/patch/monkeypatch the urlopen(url) call and use the pickled HTML in its place to create a fixture? The pytest docs example creates a class and monkeypatches requests.get(), where get is an attribute of requests, which seems a little different from what I'm doing. I haven't been able to get mine working; I think I'm supposed to use something other than monkeypatch.setattr? Below is what I tried.
@pytest.fixture(scope="session")
def player_stats_data_raw(monkeypatch):
    """
    Fixture to load web scrape html from an html file for testing.
    """
    fname = os.path.join(
        os.path.dirname(__file__), "fixture_csvs/stats_html.html"
    )
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen():
        return html

    monkeypatch.setattr(urlopen, "url", mock_urlopen)
    df = get_player_stats_data()
    return df
### The actual tests in a separate file

def test_raw_stats_rows(player_stats_data_raw):
    assert len(player_stats_data_raw) == 30

def test_raw_stats_schema(player_stats_data_raw):
    assert list(player_stats_data_raw.columns) == raw_stats_cols
The goal is to replace html = urlopen(url) in the web scraping function with this pickled html I've previously saved.
The other option is to turn that URL into an input parameter for the function: in production I call the actual URL as you see here (www.basketballreference.com/etc), and in testing I just read in that pickled value. That's an option, but I'm curious to learn and apply this patching technique to a real example. If anyone has any thoughts I'd appreciate it!
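For reference, that parameterized option might look like the sketch below (BeautifulSoup accepts raw bytes, so a test can pass the saved file's contents directly instead of a live response):

from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

def get_player_stats_data(html_source=None):
    """Scrape per-game stats; tests can inject saved HTML via html_source."""
    if html_source is None:
        # Production path: fetch the live page.
        html_source = urlopen(
            "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
        )
    soup = BeautifulSoup(html_source, "html.parser")
    headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")][1:]
    rows = soup.findAll("tr")[1:]
    player_stats = [[td.getText() for td in row.findAll("td")] for row in rows]
    return pd.DataFrame(player_stats, columns=headers)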
In your test file, you could try something like this (mocker comes from the pytest-mock plugin):
from module.script import get_player_stats_data

@pytest.fixture(scope="session")
def urlopen(mocker):
    with open(fname, "rb") as fp:
        html = fp.read()
    urlopen = mocker.patch("module.script.urlopen")
    urlopen.return_value = html
    return urlopen

def test_raw_stats_rows(urlopen):
    df = get_player_stats_data()
    assert len(df) == 30

def test_raw_stats_schema(urlopen):
    df = get_player_stats_data()
    assert list(df.columns) == raw_stats_cols
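If you'd rather stay with the built-in monkeypatch fixture from your attempt, the key detail is that "from urllib.request import urlopen" binds the name inside the scraper's own module, so that is the name to patch. A minimal sketch (assuming the scraper lives in a hypothetical module.script):

import os

import pytest

import module.script  # hypothetical module that defines get_player_stats_data

@pytest.fixture
def player_stats_data_raw(monkeypatch):
    """Load the saved HTML and patch module.script.urlopen to return it."""
    fname = os.path.join(os.path.dirname(__file__), "fixture_csvs/stats_html.html")
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen(url):
        # BeautifulSoup accepts raw bytes, so returning them directly works here.
        return html

    # Patch the name the scraper actually looks up at call time.
    monkeypatch.setattr(module.script, "urlopen", mock_urlopen)
    return module.script.get_player_stats_data()

(Note that monkeypatch is function-scoped, so this fixture can't be declared with scope="session".)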
So I made a Python program that reads my CSV file of gene accession numbers, and I tried to make requests pull up 335 URLs based on each accession number, but I got:
InvalidSchema: No connection adapters were found for '[the urls...]'
My code was:
import urllib.request as urllib
from bs4 import BeautifulSoup

def fresh_soup(url):
    '''
    Collects and parses the page source from a given url, returns the parsed page source
    - url : the url you wish to scrape
    '''
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.Request(url, headers=hdr)
    source = urllib.urlopen(req, timeout=10).read()
    soup = BeautifulSoup(source, "lxml")
    return soup
###
import csv

result = []
for line in open("C:/Projects/NCBI Scraper project/geneAccNumbers.txt"):
    result.append(line.split('/t'))

csv = open("C:/Projects/NCBI Scraper project/geneAccNumbers.txt", 'r')
for gene in csv.readline().split('/t'):
    url = 'https://www.ncbi.nlm.nih.gov/nuccore/' + gene + '.1?report=fasta'

def build_url(gene):
    return 'https://www.ncbi.nlm.nih.gov/nuccore/' + gene + '.1?report=fasta'

genes_urls = [build_url(gene) for gene in csv]
print(genes_urls)

import requests
r = requests.get(genes_urls)
Is there something I can do to make it correctly request every url?
On a side note: I think some of the URLs generated have both backslashes and forward slashes in their names, but when I manually copy one into a browser, it responds like it wasn't even a problem and still gets me the page(s) I want. Should I still try to make them all one type of slash?
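For what it's worth, requests.get() takes a single URL string, which is why passing the whole list raises InvalidSchema. A minimal sketch of requesting the pages one at a time (assuming the accession numbers are tab-separated, i.e. '\t' rather than '/t'):

import requests

# Read the accession numbers; note '\t' is the tab escape, not '/t'.
genes = []
with open("C:/Projects/NCBI Scraper project/geneAccNumbers.txt") as f:
    for line in f:
        genes.extend(g.strip() for g in line.split('\t') if g.strip())

def build_url(gene):
    return 'https://www.ncbi.nlm.nih.gov/nuccore/' + gene + '.1?report=fasta'

# requests.get() expects one URL string, so request each page in turn.
for gene in genes:
    r = requests.get(build_url(gene), timeout=10)
    print(gene, r.status_code)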
I need to download a file from an external source, and I am using Basic authentication to log in to the URL.
import requests

response = requests.get('<external url>', auth=('<username>', '<password>'))
data = response.json()
html = data['list'][0]['attachments'][0]['url']
print(html)

data = requests.get('<API URL to download the attachment>', auth=('<username>', '<password>'), stream=True)
print(data.content)
I am getting the output below:
<url to download the binary data>
\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\xcb\x00\x00\x1e\x00\x1e\x00\xbe\x07\x00\x00.\xcf\x05\x00\x00\x00'
I am expecting the URL to download the Word document within the same session.
Working solution
import requests
import shutil

response = requests.get('<url>', auth=('<username>', '<password>'))
data = response.json()
html = data['list'][0]['attachments'][0]['url']
print(html)

data = requests.get('<url>', auth=('<username>', '<password>'), stream=True)
with open("C:/myfile.docx", 'wb') as f:
    data.raw.decode_content = True
    shutil.copyfileobj(data.raw, f)
I am able to download the file as it is.
When you want to download a file directly, you can use shutil.copyfileobj():
https://docs.python.org/2/library/shutil.html#shutil.copyfileobj
You're already passing stream=True to requests, which is what you need to get a file-like object back. Just pass that raw response as the source to copyfileobj().
I am trying to use urllib to access a website and then strip the page source so I can collect some data from it. I know how to do this for public websites, but I don't know how to use urllib to do it for password-protected pages. I know the username and password; I am just very confused about how to get urllib to submit the correct credentials and then route me to the page I want to strip the data from. Currently, my code looks like this, and the problem is that it brings up the login page's source.
from tkinter import *
import csv
from re import findall
import urllib.request

def info():
    file = filedialog.askopenfilename()
    fileR = open(file, 'r')
    hold = csv.reader(fileR, delimiter=',', quotechar='|')
    aList = []
    for item in hold:
        if item[1] and item[2] == "":
            print(item[1])
            url = "www.example.com/id=" + item[1]
            request = urllib.request.urlopen(url)
            html = request.read()
            data = str(html)
            person = findall('''\$MainContent\$txtRecipient\"\stype=\"text\"\svalue=\"([^\"]+)\"''', data)
        else:
            pass
    fileR.close()
Remember, I am using Python 3.3.3. Any help would be appreciated!
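One common approach for a form-based login like this is to build an opener that keeps cookies, POST the credentials once, and then reuse that opener for the protected pages. A minimal sketch (the login URL and form field names here are hypothetical; inspect the login page's HTML for the real ones):

import http.cookiejar
import urllib.parse
import urllib.request

# An opener that stores cookies between requests, so the session survives login.
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))

# Hypothetical field names: check the login form for the real input names.
creds = urllib.parse.urlencode({'username': 'me', 'password': 'secret'}).encode('ascii')
opener.open("http://www.example.com/login", data=creds)  # POSTs the login form

# Later requests through the same opener carry the session cookie.
html = opener.open("http://www.example.com/id=123").read()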
I'm trying to figure out how to write a website monitoring script (a cron job, in the end) that opens a given URL, checks whether a tag exists, and, if the tag doesn't exist or doesn't contain the expected data, writes something to a log file or sends an e-mail.
The tag would be something like or something relatively similar.
Anyone have any ideas?
Your best bet, IMO, is to check out BeautifulSoup. Something like so:
import urllib2
from BeautifulSoup import BeautifulSoup
page = urllib2.urlopen("http://yoursite.com")
soup = BeautifulSoup(page)
# See the docs on how to search through the soup. I'm not sure what
# you're looking for so my example stops here :)
After that, emailing it or logging it is pretty standard fare.
This is some sample code (untested) that logs and sends mail:
#!/usr/bin/env python
import logging
import urllib2
import smtplib

# Log config
logging.basicConfig(filename='/tmp/yourscript.log', level=logging.INFO)

def check_content(data):
    # Your BeautifulSoup logic here
    return content_found

def send_mail(message_body):
    server = 'localhost'
    recipients = ['you@yourdomain.com']
    sender = 'script@yourdomain.com'
    message = 'From: %s \nSubject: script result \n\n %s' % (sender, message_body)
    session = smtplib.SMTP(server)
    session.sendmail(sender, recipients, message)

# Open requested url
url = "http://yoursite.com/tags/yourTag"
data = urllib2.urlopen(url)

if check_content(data):
    # Report to log
    logging.info('Content found')
else:
    # Send mail
    send_mail('Content not found')
I would code the check_content() function using BeautifulSoup.
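For instance, a minimal check_content() sketch (the tag name and id here are hypothetical placeholders for whatever you're monitoring):

from BeautifulSoup import BeautifulSoup

def check_content(data):
    # Hypothetical target: adjust the tag/attributes to what you monitor.
    soup = BeautifulSoup(data.read())
    tag = soup.find('div', {'id': 'status'})
    # Found only if the tag exists and carries the expected data.
    return tag is not None and 'expected text' in str(tag)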
The following (untested) code uses urllib2 to grab the page and re to search it.
import urllib2
import re

pageString = urllib2.urlopen('**insert url here**').read()

m = re.search(r'**insert regex for the tag you want to find here**', pageString)
if m is None:
    pass  # take action for NOT found here
else:
    pass  # take action for found here
The following (untested) code uses pycurl and StringIO to grab the page and re to search it.
import pycurl
import re
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, '**insert url here**')
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
c.close()

m = re.search(r'**insert regex for the tag you want to find here**', b.getvalue())
if m is None:
    pass  # take action for NOT found here
else:
    pass  # take action for found here