I've been trying to write an auto-registration script for a certain website (just to improve my skills with requests); the registration form is shown below.
screenshot
It's in Russian, but it only asks you to enter a username and an email, solve the captcha from the image, and solve an easy math equation.
Here is a CURL of the successful post request while sending form manually.
As you can see, the successful request has a 302 status code, but with my script the status code is 200 and I can't find the reason for it. My code is below:
def register(username = None, email = None, email_pass = None):
    """Fetch the gidonline.io registration form, fill it in and submit it.

    The form page is downloaded, its captcha image and arithmetic question
    are handed to ``solve_captcha`` / ``solve_math``, the hidden ``sabre_*``
    fields are copied from the page, and everything is posted back to
    ``wp-login.php?action=register``.

    Args:
        username: Value sent as ``user_login``.
        email: Value sent as ``user_email``.
        email_pass: Not used by this function.

    Returns:
        Always ``None``. Progress is reported with ``print``; unless the
        form comes straight back with status 200, the final page is saved
        to ``success.html``.
    """
    form_url = "https://gidonline.io/wp-login.php"
    query = {"action": "register"}
    with HTMLSession() as session:
        r = session.get(form_url, params=query)
        # with open("gidonlineForm.html", "w", encoding="utf-8") as f:
        #     f.write(r.text)
        if r.status_code != 200:
            print("Something went wrong")
            return None
        # sabre_js_payload is read from r.html after render(); every other
        # field below is parsed from the raw response text.
        r.html.render()
        soup = BeautifulSoup(r.text, "lxml")

        # Captcha: download the image and pass it on base64-encoded.
        imageLink = soup.find("img", {"alt": "captcha image"})["src"]
        print(imageLink)
        imageBytes = session.get(imageLink).content
        imageBase64 = base64.b64encode(imageBytes)
        captchaKey = solve_captcha(imageBase64)
        print(captchaKey)

        # Math question: the expression is whatever follows the last ":" in
        # the label that wraps the "math" input.
        question = soup.find("input", {"name": "math", "id": "math"}).find_parent("label").text
        expString = question.split(":")[-1].strip()
        expressionResult = solve_math(expString)

        # Hidden fields that have to be echoed back with the submission.
        encodedStr = soup.find("p", {"id": "sabre_spectre"}).find("input")["name"]
        sabre_js_check1 = soup.find("input", {"id": "sabre_js_check1"})["value"]
        sabre_js_check2 = soup.find("input", {"id": "sabre_js_check2"})["value"]
        sabre_id = soup.find("input", {"id": "sabre_id"})["value"]
        sabre_js_payload = r.html.find('#sabre_js_payload', first=True).attrs['value']

        data = {
            'user_login': username,
            'user_email': email,
            'captcha': str(captchaKey),
            'math': str(expressionResult),
            encodedStr: "",
            "sabre_js_check1": sabre_js_check1,
            "sabre_js_check2": sabre_js_check2,
            "sabre_js_payload": sabre_js_payload,
            "sabre_id": sabre_id,
            'wp-submit': 'Регистрация',
        }
        r = session.post(form_url, params=query, data=data, allow_redirects=True)

        # Redirects are followed, so a 302 answer to the POST is recorded in
        # r.history while r.status_code belongs to the final page. Only a 200
        # that was reached without any redirect means the form was rejected.
        redirect_codes = [hop.status_code for hop in r.history]
        if not redirect_codes and r.status_code == 200:
            print("Ooops...200 status code")
            error_box = r.html.find("#login_error", first=True)
            # The error element may be absent; do not dereference None.
            if error_box is not None:
                print(error_box.text)
            return None

        reported = redirect_codes[0] if redirect_codes else r.status_code
        print(f"Status code: {reported}")
        with open("success.html", "w", encoding='utf-8') as f:
            f.write(r.text)
I can easily provide any additional info if needed, thank you all!
Related
I am building a scraper that scrapes TikTok hashtags, gets the usernames from each hashtag, and then scrapes each of those usernames. When finished, I want the information to be organized in a CSV file. This is what I came up with, but it is not working as I expected. I am a beginner and I am trying to learn a new language. What am I missing or doing wrong?
import requests
import json
import pandas as pd
# Script as posted in the question: fetch a hashtag feed from TikAPI, look up
# influencer details, and write the collected columns to a CSV file.
# scrape hashtag
url = "https://api.tikapi.io/public/hashtag?count=30&id=9261"
payload={}
headers = {
'X-API-KEY': 'xxxxxx'
}
response = requests.request("GET", url, headers=headers, data=payload)
hashtag_response = response.text
hashtag_json = json.loads (hashtag_response)
# write data to hashtag json file
results_json = open("data.json", "w")
L = [response.text]
results_json.writelines(L)
results_json.close()
# list
influencer = []
followerCount = []
bioLink = []
signature = []
# NOTE(review): assumes the response has a top-level 'uniqueId' key holding an
# iterable of usernames -- confirm against the actual API response.
for uniqueId in hashtag_json ['uniqueId']:
influencer.append(uniqueId)
# scrape influencer username
# NOTE(review): this is a plain string (no f-prefix, no .format() call), so
# "{influencer}" is sent literally instead of a username.
url = "https://api.tikapi.io/public/check?username={influencer}"
payload={}
headers = {
'X-API-KEY': 'xxxxx'
}
influencerresponse = requests.request("GET", url, headers=headers, data=payload)
infl_response = influencerresponse.text
influencer_json = json.loads (infl_response)
# write data to influencer json file
results_json = open("infl_data.json", "w")
I = [influencerresponse.text]
results_json.writelines(I)
results_json.close()
# NOTE(review): three comma-separated keys inside [] look up the tuple
# ('followerCount','bioLink','signature') as one key; the loop targets
# followerCount and bioLink also rebind the list names defined above.
for followerCount, bioLink in influencer_json ['followerCount','bioLink','signature']:
followerCount.append(followerCount)
bioLink.append(bioLink)
signature.append(signature)
# create csv file of results
influencer_df = pd.DataFrame({
'Influencer' : influencer,
'Follower Count' : followerCount,
'Link' : bioLink,
'Signature' : signature,
})
influencer_df.to_csv('/Users/john/Repos/TikTok/influencer.csv', index=False)
you were wrong in this part
for uniqueId in hashtag_json ['uniqueId']:
influencer.append(uniqueId)
should be
influencer.append(hashtag_json["itemList"][0]['author']['uniqueId'])
and in this part
for followerCount, bioLink in influencer_json ['followerCount','bioLink','signature']:
should be
followerCount.append(influencer_json['userInfo']['stats']['followerCount'])
bioLink.append(influencer_json['userInfo']['user']['bioLink']['link'])
signature.append(influencer_json['userInfo']['user']['signature'])
.
import requests
import json
import pandas as pd
# Fetch one hashtag feed from TikAPI, look up the author of its first item,
# and export that author's follower count, bio link and signature to CSV.

# The same API key header is sent with both requests.
headers = {
    'X-API-KEY': 'xxxx'
}

# scrape hashtag
url = "https://api.tikapi.io/public/hashtag?count=30&id=9261"
response = requests.request("GET", url, headers=headers)
# Fail here on an HTTP error instead of with a KeyError further down.
response.raise_for_status()
hashtag_json = json.loads(response.text)

# write data to hashtag json file
with open("data.json", "w") as results_json:
    results_json.write(response.text)

# list
influencer = []
followerCount = []
bioLink = []
signature = []

# Only the author of the first item in the feed is processed.
influencer.append(hashtag_json["itemList"][0]['author']['uniqueId'])

# scrape influencer username
# Passing the username through params lets requests URL-encode it.
influencerresponse = requests.request(
    "GET",
    "https://api.tikapi.io/public/check",
    params={"username": influencer[0]},
    headers=headers,
)
influencerresponse.raise_for_status()
influencer_json = json.loads(influencerresponse.text)

# write data to influencer json file
with open("infl_data.json", "w") as results_json:
    results_json.write(influencerresponse.text)

user_info = influencer_json['userInfo']
followerCount.append(user_info['stats']['followerCount'])
bioLink.append(user_info['user']['bioLink']['link'])
signature.append(user_info['user']['signature'])

# create csv file of results
influencer_df = pd.DataFrame({
    'Influencer': influencer,
    'Follower Count': followerCount,
    'Link': bioLink,
    'Signature': signature,
})
influencer_df.to_csv('/Users/john/Repos/TikTok/influencer.csv', index=False)
I need to pull data from an OData API. With the code below I do receive data, but only 250 rows.
The JSON contains a key called @odata.nextLink that holds a single value: BASE_URL + endpoint + ?$skip=250
How can I loop through the next pages?
import requests
import pandas as pd
import json
BASE_URL = "base_url"
def session_token():
    """Request an OAuth access token and return it as an Authorization value.

    Posts the hard-coded username/password credentials with
    ``grant_type=password`` to ``BASE_URL + '/api/oauth/token'``.

    Returns:
        ``"Bearer "`` followed by the ``access_token`` field of the JSON
        response.
    """
    url = BASE_URL + '/api/oauth/token'
    headers = {
        # Media types use a forward slash; "application\json" is not a valid
        # Accept value (and "\j" is an invalid escape sequence).
        "Accept": "application/json",
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
    }
    body = {
        "username": "user",
        "password": "pwd",
        "grant_type": "password",
    }
    response = requests.post(url, headers=headers, data=body)
    return "Bearer " + response.json()["access_token"]
def make_request(endpoint, token = None):
    """GET ``BASE_URL + endpoint`` and return the decoded JSON body.

    Args:
        endpoint: Path appended to ``BASE_URL``.
        token: Value for the ``Authorization`` header. When omitted, a new
            token is requested through ``session_token()`` on each call.

    Returns:
        The parsed JSON document when the status code is 200, otherwise
        ``None``.
    """
    # A default of ``session_token()`` in the signature would run once, when
    # the function is defined, and reuse that single token for every call.
    if token is None:
        token = session_token()
    headers = {"Authorization": token}
    response = requests.get(BASE_URL + endpoint, headers = headers)
    if response.status_code == 200:
        return json.loads(response.text)
    return None
make_request("/odata/endpoint")
Following @Marek Piotrowski's advice, I modified my code and came to a solution:
def main():
    """Yield the ``value`` payload of every page of the OData endpoint.

    Starts at ``BASE_URL + "endpoint"`` and keeps requesting the URL found
    under ``@odata.nextLink`` until a page no longer carries that key.

    Yields:
        ``json_data['value']`` for each page, in the order received.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    headers = {"Authorization": session_token()}
    url = BASE_URL + "endpoint"
    while url:
        # Request the URL of the current page, not the first page again.
        response = requests.get(url, headers=headers)
        # An error status would otherwise leave ``url`` unchanged and loop forever.
        response.raise_for_status()
        json_data = json.loads(response.text)
        yield json_data['value']
        # The last page has no next link; .get() returns None and ends the loop.
        # NOTE(review): assumes the link is an absolute URL -- confirm.
        url = json_data.get("@odata.nextLink")
result = pd.concat((json_normalize(row) for row in main()))
print(result) # Final dataframe, works like a charm :)
Something like this should retrieve all records, I believe (assuming there is indeed an @odata.nextLink key in json_data):
def retrieve_all_records(endpoint, token = None):
    """Collect the records of every page of an OData endpoint into one list.

    Args:
        endpoint: Path appended to ``BASE_URL`` for the first request.
        token: Value for the ``Authorization`` header. When omitted, a token
            is requested through ``session_token()`` at call time.

    Returns:
        A list with the ``value`` entries of all pages, in order.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    # Resolve the token per call rather than once at definition time.
    if token is None:
        token = session_token()
    all_records = []
    headers = {"Authorization": token}
    url = BASE_URL + endpoint
    while url:
        response = requests.get(url, headers = headers)
        # Without this, a non-200 answer would repeat the same request forever.
        response.raise_for_status()
        json_data = json.loads(response.text)
        # Rows are read from "value", the same key the paging generator in
        # this file yields from.
        all_records.extend(json_data['value'])
        # Absent on the last page, which ends the loop.
        url = json_data.get('@odata.nextLink')
    return all_records
The code is untested, though. Let me know if it works. Alternatively, you could make some recursive call to make_request, I believe, but you'd have to store results somewhere above the function itself then.
I know that this is late, but you could look at this article by Ephram Mwai on Towards Data Science.
He pretty much solved the problem with a good script.
There is a website I need to scrape, but before I do I need to login.
There seem to be three things I need to submit: the username, the password and the authenticity token. I know the username and password, but I am not sure how to access the token.
This is what I have tried:
import requests
from lxml import html
# Log in to NASA Earthdata: fetch the page, read the authenticity token from
# its form, then post the credentials together with that token.
login_url = "https://urs.earthdata.nasa.gov/home"
session_requests = requests.session()
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
# XPath addresses attributes with "@": take the value attribute of the input
# whose name attribute is "authenticity_token".
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
payload = {
    "username": "my_name",
    "password": "my_password",
    "authenticity_token": authenticity_token,
}
# NOTE(review): the credentials are posted to the same URL the page was
# fetched from -- confirm this matches the form's action attribute.
result = session_requests.post(
    login_url,
    data = payload,
    headers = dict(referer=login_url)
)
print (result)
This results in :
<Response [404]>
My name and password are entered correctly so it is the token that must be going wrong. I think the problem is this line:
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]  # "@" selects attributes in XPath
or this line:
payload = {"username": "my_name",
"password": "my_password",
"authenticity_token": authenticity_token}
By looking at the source code of the webpage I noticed there is an authenticity_token, a csrf-token and a csrf-param. So it's possible these are in the wrong order, but I tried all the combinations.
EDIT:
Here is a Beautiful Soup approach that results in a 404 again.
# Beautiful Soup variant of the login attempt: read the authenticity_token and
# utf8 form fields from the page and post them back with the credentials.
s = requests.session()
response = s.get(login_url)
soup = BeautifulSoup(response.text, "lxml")
# soup('input') is shorthand for soup.find_all('input').
# NOTE(review): n['name'] raises KeyError for an <input> without a name
# attribute; n.get('name') would not.
for n in soup('input'):
if n['name'] == 'authenticity_token':
token = n['value']
if n['name'] == 'utf8':
utf8 = n['value']
break
# NOTE(review): token and utf8 are bound only if matching inputs were seen
# before the break; otherwise the dict below raises NameError.
auth = {
'username': 'my_username'
, 'password': 'my_password'
, 'authenticity_token': token
, 'utf8' : utf8
}
# Posts to the same login_url the page was fetched from.
s.post(login_url, data=auth)
If you inspect the page you'll notice that the form's action value is '/login', so you have to submit your data to 'https://urs.earthdata.nasa.gov/login'.
# Log in to NASA Earthdata: load the home page to collect the form's input
# fields, fill in the credentials, and post everything to the /login URL.
login_url = "https://urs.earthdata.nasa.gov/login"
home_url = "https://urs.earthdata.nasa.gov/home"
s = requests.session()
soup = BeautifulSoup(s.get(home_url).text, "lxml")
# Start from every named <input> on the page, which carries the hidden token
# fields along. Inputs without a name attribute are skipped; indexing them
# with i['name'] would raise KeyError.
data = {
    i['name']: i.get('value', '')
    for i in soup.find_all('input')
    if i.get('name')
}
data['username'] = 'my_username'
data['password'] = 'my_password'
result = s.post(login_url, data=data)
print(result)
< Response [200]>
A quick example with selenium:
from selenium import webdriver
# Open the protected URL in Firefox, fill in the login form, submit it, and
# keep the resulting page source in ``html``.
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
try:
    url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
    driver.get(url)
    # find_element(By.<strategy>, value) replaces the find_element_by_*
    # helpers, which newer Selenium releases no longer provide.
    driver.find_element(By.NAME, 'username').send_keys('my_username')
    driver.find_element(By.NAME, 'password').send_keys('my_password')
    driver.find_element(By.ID, 'login').submit()
    html = driver.page_source
finally:
    # Close the browser even when a lookup or the page load fails.
    driver.quit()
I am following this tutorial, but I can't seem to get any data when I run the Python script. I get an HTTP status code of 200 and status.ok returns a true value. Any help would be great. This is what my response looks like in the terminal:
[]
200
True
import requests
from lxml import html
USERNAME = "username#email.com"
PASSWORD = "legitpassword"
LOGIN_URL = "https://bitbucket.org/account/signin/?next=/"
URL = "https://bitbucket.org/dashboard/overview"
def main():
    """Log in to Bitbucket and print the repository names found on the dashboard.

    Fetches ``LOGIN_URL`` to read the ``csrfmiddlewaretoken`` form field,
    posts ``USERNAME``/``PASSWORD`` with that token, then requests ``URL``
    and prints the text of the matched elements followed by the status code
    of that last request.
    """
    session_requests = requests.session()

    # Get login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    # XPath addresses attributes with "@".
    authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

    # Create payload
    payload = {
        "username": USERNAME,
        "password": PASSWORD,
        "csrfmiddlewaretoken": authenticity_token
    }

    # Perform login
    result = session_requests.post(LOGIN_URL, data = payload, headers = dict(referer = LOGIN_URL))

    # Scrape url
    result = session_requests.get(URL, headers = dict(referer = URL))
    tree = html.fromstring(result.content)
    # NOTE(review): yields an empty list when the page has no
    # <span class="repo-name"> elements -- confirm against the current markup.
    bucket_elems = tree.findall(".//span[@class='repo-name']")
    bucket_names = [bucket_elem.text_content().replace("\n", "").strip() for bucket_elem in bucket_elems]

    # print() with one argument behaves the same on Python 2 and Python 3.
    print(bucket_names)
    print(result.status_code)
if __name__ == '__main__':
main()
The xpath is wrong, there is no span with the class repo-name, you can get the repo names from the anchor tags with:
# "@class" matches on the class attribute of the anchor tags.
bucket_elems = tree.xpath("//a[@class='execute repo-list--repo-name']")
bucket_names = [bucket_elem.text_content().strip() for bucket_elem in bucket_elems]
The html has obviously changed since the tutorial was written.
def get_main_page_url(urlparameter, strDestPath, strMD5):
    """Log in to malwr.com, submit an MD5 search and download the linked file.

    After logging in, the search form at ``urlparameter`` is posted with
    ``strMD5``. If the last row of the table inside the ``file`` section of
    the fetched page contains a link, the content behind that link is
    written to disk.

    Args:
        urlparameter: URL of the search page; the original listing had the
            literal "https://malwr.com/analysis/search/" in this position.
        strDestPath: Not referenced in the body.
        strMD5: Value submitted in the ``search`` form field.

    Returns:
        None.
    """
    base_url = 'https://malwr.com/'
    url = 'https://malwr.com/account/login/'
    username = 'myname'
    password = 'pswd'
    session = requests.Session()

    # getting csrf value
    response = session.get(url)
    soup = bs4.BeautifulSoup(response.content)
    csrf = soup.form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')

    # logging in
    credentials = {
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': csrf
    }
    session.post(url, data=credentials)

    # getting analysis data: the search form carries its own csrf token
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    csrf = soup.form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')
    data = {
        'search': strMD5,
        'csrfmiddlewaretoken': csrf
    }
    # NOTE(review): the POST response is discarded and the page is fetched
    # again with GET -- confirm which response holds the search results.
    session.post(urlparameter, data = data)
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    print(soup)

    # Look the anchor up once: last row of the table in the "file" section.
    last_row_link = soup.find('section', id='file').find('table').find_all('tr')[-1].a
    if last_row_link is None:
        return

    link = urljoin(base_url, last_row_link.get('href'))
    webFile = session.get(link)
    # NOTE(review): ``arg`` is not defined in this function; presumably a
    # destination prefix (strDestPath?) -- confirm before relying on it.
    filename = arg + link.split('/')[-2]
    # The context manager closes the output file even if the write fails.
    with open(filename, 'wb') as localFile:
        localFile.write(webFile.content)
    webFile.close()
I am able to log in by fetching the CSRF token. Then I try to send an MD5 to search on malwr.com; however, I am not able to get the page that shows the results for the submitted MD5.
I want to search for the MD5 that is posted along with the CSRF token.
Please let me know what is wrong in the code.
You've done almost everything correctly. Except that you need to pass the result of the POST request to BeautifulSoup. Replace:
session.post(urlparameter, data = data)
response = session.get(urlparameter)
with:
response = session.post(urlparameter, data=data)
Worked for me (I had an account at malwr).