How to pass a search key and get the result through bs4 - Python

import requests
import bs4
from urllib.parse import urljoin

# urlparameter is the search page, e.g. "https://malwr.com/analysis/search/"
def get_main_page_url(urlparameter, strDestPath, strMD5):
    base_url = 'https://malwr.com/'
    url = 'https://malwr.com/account/login/'
    username = 'myname'
    password = 'pswd'
    session = requests.Session()

    # getting csrf value
    response = session.get(url)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')

    # logging in
    data = {
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': csrf
    }
    session.post(url, data=data)

    # getting analysis data
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')
    data = {
        'search': strMD5,
        'csrfmiddlewaretoken': csrf
    }
    session.post(urlparameter, data=data)
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    print(soup)

    if soup.find('section', id='file').find('table')('tr')[-1].a is not None:
        link = soup.find('section', id='file').find('table')('tr')[-1].a.get('href')
        link = urljoin(base_url, link)
        webFile = session.get(link)
        filename = link.split('/')[-2]
        filename = strDestPath + filename
        localFile = open(filename, 'wb')
        localFile.write(webFile.content)
        webFile.close()
        localFile.close()
I am able to log in by scraping the csrf token. Then I try to submit an MD5 hash to the search on malwr.com, but I cannot get the page with the search results for that MD5. I want to search for the MD5 that is passed along with the csrf token. Please let me know what is wrong in the code.

You've done almost everything correctly, except that you need to pass the result of the POST request to BeautifulSoup. Replace:
session.post(urlparameter, data = data)
response = session.get(urlparameter)
with:
response = session.post(urlparameter, data=data)
Worked for me (I had an account at malwr).
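For reference, a minimal sketch of the corrected search step (same names as in the question; the explicit parser is my addition, to silence the bs4 warning):

response = session.post(urlparameter, data=data)
soup = bs4.BeautifulSoup(response.content, 'html.parser')

The POST response already contains the rendered search results, so the follow-up GET is unnecessary.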

Trying to make a Yelp API call with a list of business IDs

When I run the code it gives me this error:
requests.exceptions.MissingSchema: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?
Here is the code I am working with:
import requests

from yelp_api_key import YELP_KEY
from yelp_api_location import loc_ids

MY_API_KEY = YELP_KEY
BUSINESS_PATH = f'https://api.yelp.com/v3/businesses/{loc_ids}/reviews'
HEADERS = {'Authorization': 'bearer %s' % MY_API_KEY}
PARAMETERS = {'locale': 'en_US'}

for links in BUSINESS_PATH:
    response = requests.get(url=links, params=PARAMETERS, headers=HEADERS)
    business_data = response.json()
    data = business_data['reviews']
    print(data)
    for x in data:
        quotes = x['text']
        print(quotes)
Below is the code that works for me. I just want to be able to call multiple API endpoints without having to list them every time. Any suggestions would be great, TIA!
MY_API_KEY = YELP_KEY
BUSINESS_PATH = ['https://api.yelp.com/v3/businesses/eL4d1tHv1mFoepoS_3rGbw/reviews',
                 'https://api.yelp.com/v3/businesses/RzS-wNTycqB5WA34JfgW0g/reviews',
                 'https://api.yelp.com/v3/businesses/PyV1e_OebaWm1cGUwtDvHA/reviews',
                 'https://api.yelp.com/v3/businesses/dcbALMl6oyv_fdJ6dZGxzA/reviews',
                 'https://api.yelp.com/v3/businesses/4uRA53NIl82a3QeZX-PcRw/reviews']
HEADERS = {'Authorization': 'bearer %s' % MY_API_KEY}
PARAMETERS = {'locale': 'en_US'}

reviews = []
for links in BUSINESS_PATH:
    response = requests.get(url=links, params=PARAMETERS, headers=HEADERS)
    business_data = response.json()
    data = business_data['reviews']
    for x in data:
        quotes = x['text']
        reviews.append(quotes)
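Since the thread records no accepted answer, here is a sketch of one likely fix: iterating over the single f-string in the first snippet yields individual characters ('h', 't', 't', 'p', ...), which is exactly why requests complains about the invalid URL 'h'. Assuming loc_ids is a list of business-ID strings, the endpoint list can be built with a comprehension instead of being written out by hand:

BUSINESS_PATH = [f'https://api.yelp.com/v3/businesses/{biz_id}/reviews'
                 for biz_id in loc_ids]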

Urllib2 grab web-page element then reverse it

I need to visit http://www.chiquitooenterprise.com/, reverse the string it returns, and access the website using this URL: http://www.chiquitooenterprise.com/password?code=REVERSEDSTRING
How can I do this using urllib2 and Python?
link = "http://www.chiquitooenterprise.com/password"
request = urllib2.Request("http://www.chiquitooenterprise.com/password")
contents = urllib2.urlopen(request).read()
revString = request[::-1]
answer = "http://www.chiquitooenterprise.com/password?code=" + revString
response = urllib2.urlopen(answer)
response = response.read()
print(response)```
link = "http://www.chiquitooenterprise.com/password"
result = requests.get("http://www.chiquitooenterprise.com/password")
contents = result.text
revString = contents[::-1]
answer = f"http://www.chiquitooenterprise.com/password?code={revString}"
response = requests.get(answer)
response = response.text
print(response)
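One caveat (my assumption, not verified against the site): if the page body ends with a trailing newline, the reversed string will start with it and corrupt the query string, so stripping whitespace first may be safer:

revString = contents.strip()[::-1]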

Authentication results in 404 code

There is a website I need to scrape, but before I do I need to log in.
There seem to be three things I need to get in: the username, the password, and an authenticity token. The username and password I know, but I am not sure how to access the token.
This is what I have tried:
import requests
from lxml import html

login_url = "https://urs.earthdata.nasa.gov/home"

session_requests = requests.session()
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
payload = {"username": "my_name",
           "password": "my_password",
           "authenticity_token": authenticity_token}
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
print(result)
This results in:
<Response [404]>
My name and password are entered correctly, so it must be the token that is going wrong. I think the problem is this line:
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
or this line:
payload = {"username": "my_name",
           "password": "my_password",
           "authenticity_token": authenticity_token}
Looking at the source code of the web page, I noticed there are an authenticity_token, a csrf-token and a csrf-param. So it's possible these are in the wrong order, but I tried all the combinations.
EDIT:
Here is a BeautifulSoup approach that again results in a 404.
s = requests.session()
response = s.get(login_url)
soup = BeautifulSoup(response.text, "lxml")
for n in soup('input'):
    if n['name'] == 'authenticity_token':
        token = n['value']
    if n['name'] == 'utf8':
        utf8 = n['value']
        break
auth = {
    'username': 'my_username',
    'password': 'my_password',
    'authenticity_token': token,
    'utf8': utf8
}
s.post(login_url, data=auth)
If you inspect the page you'll notice that the form's action value is '/login', so you have to submit your data to https://urs.earthdata.nasa.gov/login. Collecting every form input (including the hidden token fields) and then overwriting the username and password mirrors what the browser would submit:
login_url = "https://urs.earthdata.nasa.gov/login"
home_url = "https://urs.earthdata.nasa.gov/home"

s = requests.session()
soup = BeautifulSoup(s.get(home_url).text, "lxml")
data = {i['name']: i.get('value', '') for i in soup.find_all('input')}
data['username'] = 'my_username'
data['password'] = 'my_password'
result = s.post(login_url, data=data)
print(result)

<Response [200]>
A quick example with selenium:
from selenium import webdriver
driver = webdriver.Firefox()
url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
driver.get(url)
driver.find_element_by_name('username').send_keys('my_username')
driver.find_element_by_name('password').send_keys('my_password')
driver.find_element_by_id('login').submit()
html = driver.page_source
driver.quit()
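A side note, not part of the original answer: Selenium 4 removed the find_element_by_* helpers, so on current versions the equivalent calls are:

from selenium.webdriver.common.by import By

driver.find_element(By.NAME, 'username').send_keys('my_username')
driver.find_element(By.NAME, 'password').send_keys('my_password')
driver.find_element(By.ID, 'login').submit()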

I am trying to scrape HTML from a site that requires a login but am not getting any data

I am following this tutorial but I can't seem to get any data when I run the Python script. I get an HTTP status code of 200 and status.ok returns True. Any help would be great. This is what my response looks like in the terminal:
[]
200
True
import requests
from lxml import html

USERNAME = "username@email.com"
PASSWORD = "legitpassword"
LOGIN_URL = "https://bitbucket.org/account/signin/?next=/"
URL = "https://bitbucket.org/dashboard/overview"

def main():
    session_requests = requests.session()

    # Get login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

    # Create payload
    payload = {
        "username": USERNAME,
        "password": PASSWORD,
        "csrfmiddlewaretoken": authenticity_token
    }

    # Perform login
    result = session_requests.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))

    # Scrape url
    result = session_requests.get(URL, headers=dict(referer=URL))
    tree = html.fromstring(result.content)
    bucket_elems = tree.findall(".//span[@class='repo-name']")
    bucket_names = [bucket_elem.text_content().replace("\n", "").strip() for bucket_elem in bucket_elems]
    print(bucket_names)
    print(result.status_code)

if __name__ == '__main__':
    main()
The XPath is wrong; there is no span with the class repo-name. You can get the repo names from the anchor tags with:
bucket_elems = tree.xpath("//a[@class='execute repo-list--repo-name']")
bucket_names = [bucket_elem.text_content().strip() for bucket_elem in bucket_elems]
The HTML has evidently changed since the tutorial was written.
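One fragile point worth noting (my addition, not from the original answer): XPath compares @class against the full attribute string, so the query above breaks if the class list changes order or gains an entry. A looser match is:

bucket_elems = tree.xpath("//a[contains(@class, 'repo-list--repo-name')]")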

Unable to create Wiki page using python requests

I am trying to edit a page on a wiki that runs the MediaWiki software, but it is not working. I am able to successfully log in, but unable to edit pages. I am unsure what is causing this problem, as I included the edit token in my request. Here is my code:
import requests
from bs4 import BeautifulSoup as bs

def get_login_token(raw_resp):
    soup = bs(raw_resp.text, 'lxml')
    token = [n.get('value', '') for n in soup.find_all('input')
             if n.get('name', '') == 'wpLoginToken']
    return token[0]

def get_edit_token(raw_resp):
    soup = bs(raw_resp.text, 'lxml')
    token = [n.get('value', '') for n in soup.find_all('input')
             if n.get('name', '') == 'wpEditToken']
    return token[0]

# login
s = requests.Session()
values = {'wpName': 'username',
          'wpPassword': 'password',
          'wpLoginAttempt': 'Log in',
          'wpForceHttps': '1',
          'wpLoginToken': ''}
url = '.....'
resp = s.get(url)
values['wpLoginToken'] = get_login_token(resp)
req = s.post(url, values)

# edit page
url1 = '.....'
editing = {'wpTextbox1': 'hi there',
           'wpSave': 'Save page',
           'wpSummary': 'hi',
           'wpEditToken': ''}
resp = s.get(url1)
editing['wpEditToken'] = get_edit_token(resp)
edit = s.post(url1, editing)
print(edit.url)
print(edit.content)
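Since the thread has no answer, one common alternative (my suggestion, not from the original post) is to skip the HTML forms entirely and use the MediaWiki web API, which hands out login and CSRF tokens directly. A minimal sketch, assuming the wiki exposes api.php and the account is a bot password or the wiki still accepts action=login:

import requests

API_URL = 'https://example.org/w/api.php'  # hypothetical endpoint; use the wiki's real api.php

s = requests.Session()

# 1. fetch a login token
login_token = s.get(API_URL, params={
    'action': 'query', 'meta': 'tokens', 'type': 'login', 'format': 'json'
}).json()['query']['tokens']['logintoken']

# 2. log in (bot-password style login)
s.post(API_URL, data={
    'action': 'login', 'lgname': 'username', 'lgpassword': 'password',
    'lgtoken': login_token, 'format': 'json'
})

# 3. fetch a CSRF (edit) token for the logged-in session
csrf_token = s.get(API_URL, params={
    'action': 'query', 'meta': 'tokens', 'format': 'json'
}).json()['query']['tokens']['csrftoken']

# 4. edit the page
edit = s.post(API_URL, data={
    'action': 'edit', 'title': 'Sandbox', 'text': 'hi there',
    'summary': 'hi', 'token': csrf_token, 'format': 'json'
})
print(edit.json())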
