How can I make a TikTok scraper scrape and add to a CSV - Python

I am building this scraper and I am trying to scrape TikTok hashtags, get the usernames from a hashtag, and then scrape each username that I previously collected. When finished, I want the information organized in a CSV file. This is what I came up with, but it is not working as I expected. I am a beginner and I am trying to learn a new language. What am I missing and doing wrong?
import requests
import json
import pandas as pd

# scrape hashtag
url = "https://api.tikapi.io/public/hashtag?count=30&id=9261"
payload = {}
headers = {
    'X-API-KEY': 'xxxxxx'
}
response = requests.request("GET", url, headers=headers, data=payload)
hashtag_response = response.text
hashtag_json = json.loads(hashtag_response)

# write data to hashtag json file
results_json = open("data.json", "w")
L = [response.text]
results_json.writelines(L)
results_json.close()

# lists
influencer = []
followerCount = []
bioLink = []
signature = []

for uniqueId in hashtag_json['uniqueId']:
    influencer.append(uniqueId)

# scrape influencer username
url = "https://api.tikapi.io/public/check?username={influencer}"
payload = {}
headers = {
    'X-API-KEY': 'xxxxx'
}
influencerresponse = requests.request("GET", url, headers=headers, data=payload)
infl_response = influencerresponse.text
influencer_json = json.loads(infl_response)

# write data to influencer json file
results_json = open("infl_data.json", "w")
I = [influencerresponse.text]
results_json.writelines(I)
results_json.close()

for followerCount, bioLink in influencer_json['followerCount','bioLink','signature']:
    followerCount.append(followerCount)
    bioLink.append(bioLink)
    signature.append(signature)

# create csv file of results
influencer_df = pd.DataFrame({
    'Influencer': influencer,
    'Follower Count': followerCount,
    'Link': bioLink,
    'Signature': signature,
})
influencer_df.to_csv('/Users/john/Repos/TikTok/influencer.csv', index=False)

You went wrong in this part:

for uniqueId in hashtag_json['uniqueId']:
    influencer.append(uniqueId)

which should be:

influencer.append(hashtag_json["itemList"][0]['author']['uniqueId'])

and in this part:

for followerCount, bioLink in influencer_json['followerCount','bioLink','signature']:

which should be:

followerCount.append(influencer_json['userInfo']['stats']['followerCount'])
bioLink.append(influencer_json['userInfo']['user']['bioLink']['link'])
signature.append(influencer_json['userInfo']['user']['signature'])
Here is the full corrected code:
import requests
import json
import pandas as pd

# scrape hashtag
url = "https://api.tikapi.io/public/hashtag?count=30&id=9261"
payload = {}
headers = {
    'X-API-KEY': 'xxxx'
}
response = requests.request("GET", url, headers=headers, data=payload)
hashtag_response = response.text
hashtag_json = json.loads(hashtag_response)

# write data to hashtag json file
results_json = open("data.json", "w")
L = [response.text]
results_json.writelines(L)
results_json.close()

# lists
influencer = []
followerCount = []
bioLink = []
signature = []

influencer.append(hashtag_json["itemList"][0]['author']['uniqueId'])

# scrape influencer username
url = "https://api.tikapi.io/public/check?username={}".format(influencer[0])
payload = {}
headers = {
    'X-API-KEY': 'xxxx'
}
influencerresponse = requests.request("GET", url, headers=headers, data=payload)
infl_response = influencerresponse.text
influencer_json = json.loads(infl_response)

# write data to influencer json file
results_json = open("infl_data.json", "w")
I = [influencerresponse.text]
results_json.writelines(I)
results_json.close()

followerCount.append(influencer_json['userInfo']['stats']['followerCount'])
bioLink.append(influencer_json['userInfo']['user']['bioLink']['link'])
signature.append(influencer_json['userInfo']['user']['signature'])

# create csv file of results
influencer_df = pd.DataFrame({
    'Influencer': influencer,
    'Follower Count': followerCount,
    'Link': bioLink,
    'Signature': signature,
})
influencer_df.to_csv('/Users/john/Repos/TikTok/influencer.csv', index=False)
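
Note that the corrected script only takes the first author from the hashtag response. If you want one CSV row per author, a minimal sketch of the same idea in a loop (assuming every item in itemList has the same author/userInfo shape used above, and guarding bioLink because not every profile necessarily has one):

import requests
import pandas as pd

headers = {'X-API-KEY': 'xxxx'}

# hashtag feed (same endpoint as above)
hashtag_json = requests.get(
    "https://api.tikapi.io/public/hashtag?count=30&id=9261",
    headers=headers).json()

rows = []
for item in hashtag_json.get("itemList", []):
    username = item['author']['uniqueId']
    # profile lookup for each author
    profile = requests.get(
        "https://api.tikapi.io/public/check?username={}".format(username),
        headers=headers).json()
    user = profile['userInfo']['user']
    stats = profile['userInfo']['stats']
    rows.append({
        'Influencer': username,
        'Follower Count': stats['followerCount'],
        # bioLink may be missing for some accounts, hence .get()
        'Link': user.get('bioLink', {}).get('link', ''),
        'Signature': user.get('signature', ''),
    })

pd.DataFrame(rows).to_csv('influencer.csv', index=False)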

Related

trying to make an Yelp API call with a list of business IDs

When I run the code it gives me this error:
requests.exceptions.MissingSchema: Invalid URL 'h': No scheme supplied. Perhaps you meant http://h?
Here is the code I am working with:
import requests
from yelp_api_key import YELP_KEY
from yelp_api_location import loc_ids

MY_API_KEY = YELP_KEY
BUSINESS_PATH = f'https://api.yelp.com/v3/businesses/{loc_ids}/reviews'
HEADERS = {'Authorization': 'bearer %s' % MY_API_KEY}
PARAMETERS = {'locale': 'en_US'}

for links in BUSINESS_PATH:
    response = requests.get(url=links,
                            params=PARAMETERS,
                            headers=HEADERS)
    business_data = response.json()
    data = business_data['reviews']
    print(data)
    for x in data:
        quotes = (x['text'])
        print(quotes)
Below is the code that is working for me. I just want to be able to call multiple APIs without having to list the endpoints every time. Any suggestions would be great, TIA!
MY_API_KEY = YELP_KEY
BUSINESS_PATH = ['https://api.yelp.com/v3/businesses/eL4d1tHv1mFoepoS_3rGbw/reviews',
                 'https://api.yelp.com/v3/businesses/RzS-wNTycqB5WA34JfgW0g/reviews',
                 'https://api.yelp.com/v3/businesses/PyV1e_OebaWm1cGUwtDvHA/reviews',
                 'https://api.yelp.com/v3/businesses/dcbALMl6oyv_fdJ6dZGxzA/reviews',
                 'https://api.yelp.com/v3/businesses/4uRA53NIl82a3QeZX-PcRw/reviews']
HEADERS = {'Authorization': 'bearer %s' % MY_API_KEY}
PARAMETERS = {'locale': 'en_US'}
reviews = []
for links in BUSINESS_PATH:
    # file_name = uuid.uuid1()
    response = requests.get(url=links,
                            params=PARAMETERS,
                            headers=HEADERS)
    business_data = response.json()
    data = business_data['reviews']
    for x in data:
        quotes = (x['text'])
        # print(quotes)
        reviews.append(quotes)
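
If you keep only the business IDs in a list, you can build each URL inside the loop instead of hard-coding every endpoint. A minimal sketch, assuming loc_ids can be turned into a plain list of Yelp business ID strings (the two IDs below are taken from the hard-coded URLs above, just as examples):

import requests
from yelp_api_key import YELP_KEY

MY_API_KEY = YELP_KEY
HEADERS = {'Authorization': 'bearer %s' % MY_API_KEY}
PARAMETERS = {'locale': 'en_US'}

# assumed: a plain list of Yelp business IDs
loc_ids = ['eL4d1tHv1mFoepoS_3rGbw', 'RzS-wNTycqB5WA34JfgW0g']

reviews = []
for business_id in loc_ids:
    # build the endpoint URL from the ID
    url = f'https://api.yelp.com/v3/businesses/{business_id}/reviews'
    response = requests.get(url, params=PARAMETERS, headers=HEADERS)
    for review in response.json().get('reviews', []):
        reviews.append(review['text'])

print(reviews)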

Problem with response status code: 'response' is not defined

Basically, I am trying to pass a list of IDs from a spreadsheet, in payloads of 100, to delete organizations using the destroy_many endpoint.
import json
import xlrd
import requests

session = requests.Session()
session.headers = {'Content-Type': 'application/json'}
session.auth = 'my email', 'password'
url = 'https://domain.zendesk.com/api/v2/organizations/destroy_many.json'

payloads = []
organizations_dict = {}
book = xlrd.open_workbook('orgs_list_destroy.xls')
sheet = book.sheet_by_name('Sheet1')
for row in range(1, sheet.nrows):
    if sheet.row_values(row)[2]:
        organizations_dict = {'ids': int(sheet.row_values(row)[2])}
        if len(organizations_dict) == 100:
            payloads.append(json.dumps(organizations_dict))
            organizations_dict = {}
if organizations_dict:
    payloads.append(json.dumps(organizations_dict))

for payload in payloads:
    response = session.delete(url, data=payload)
    if response.status_code != 200:
        print('Import failed with status {}'.format(response.status_code))
        exit()
    print('Successfully imported a batch of organizations')
Try placing it outside the for loop, where you're defining your request headers:
url = 'https://{{YOURDOMAIN}}.zendesk.com/api/v2/organizations/destroy_many.json'
user = 'YOUR_EMAIL@DOMAIN.com' + '/token'
pwd = '{{YOUR_TOKEN}}'
headers = {'Content-Type': 'application/json'}
response = requests.delete(url, auth=(user, pwd), headers=headers)
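
As for the batching itself, the question's loop overwrites organizations_dict on every row, so each payload only ever holds one ID. A hedged sketch that keeps the same payload format but accumulates the IDs into chunks of 100 (sheet name and column index carried over from the question, credentials as placeholders) might look like this:

import json
import xlrd
import requests

session = requests.Session()
session.headers = {'Content-Type': 'application/json'}
session.auth = ('YOUR_EMAIL@DOMAIN.com/token', 'YOUR_TOKEN')

url = 'https://YOURDOMAIN.zendesk.com/api/v2/organizations/destroy_many.json'

book = xlrd.open_workbook('orgs_list_destroy.xls')
sheet = book.sheet_by_name('Sheet1')

# collect every organization id from the third column
ids = [int(sheet.row_values(row)[2])
       for row in range(1, sheet.nrows)
       if sheet.row_values(row)[2]]

# split into payloads of at most 100 ids each
payloads = [json.dumps({'ids': ids[i:i + 100]})
            for i in range(0, len(ids), 100)]

for payload in payloads:
    response = session.delete(url, data=payload)
    if response.status_code != 200:
        print('Batch failed with status {}'.format(response.status_code))
        break
    print('Successfully deleted a batch of organizations')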

Python: how to extract data from Odata API that contains pages #odata.nextLink

I need to pull data from an OData API. With the code below I do receive data, but only 250 rows.
The JSON contains a key called #odata.nextLink that holds one value: the BASE_URL + endpoint + ?$skip=250.
How can I loop through the next pages?
import requests
import pandas as pd
import json

BASE_URL = "base_url"

def session_token():
    url = BASE_URL + '/api/oauth/token'
    headers = {"Accept": "application/json",
               "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"}
    body = {"username": "user",
            "password": "pwd",
            "grant_type": "password"}
    return "Bearer " + requests.post(url, headers=headers, data=body).json()["access_token"]

def make_request(endpoint, token=session_token()):
    headers = {"Authorization": token}
    response = requests.get(BASE_URL + endpoint, headers=headers)
    if response.status_code == 200:
        json_data = json.loads(response.text)
        return json_data

make_request("/odata/endpoint")
Following Marek Piotrowski's advice, I modified the code and came to a solution:
def main():
    url = "endpoint"
    while True:
        if not url:
            break
        response = make_request("endpoint")
        if response.status_code == 200:
            json_data = json.loads(response.text)
            url = json_data["#odata.nextLink"]  # Fetch next link
            yield json_data['value']

result = pd.concat((json_normalize(row) for row in main()))
print(result)  # Final dataframe, works like a charm :)
Something like this would retrieve all the records, I believe (assuming #odata.nextLink is indeed present in json_data):
def retrieve_all_records(endpoint, token=session_token()):
    all_records = []
    headers = {"Authorization": token}
    url = BASE_URL + endpoint
    while True:
        if not url:
            break
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            json_data = json.loads(response.text)
            all_records = all_records + json_data['records']
            url = json_data['#odata.nextLink']
    return all_records
The code is untested, though; let me know if it works. Alternatively, you could make a recursive call to make_request, I believe, but then you would have to store the results somewhere outside the function itself, as in the sketch below.
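If you'd rather go the recursive route, a rough sketch (reusing session_token and BASE_URL from the question, and assuming the records live under the standard OData value key) could be:

import requests
import pandas as pd

def fetch_all(url, token=session_token(), records=None):
    # follow #odata.nextLink recursively until it is no longer present
    records = [] if records is None else records
    response = requests.get(url, headers={"Authorization": token})
    if response.status_code != 200:
        return records
    json_data = response.json()
    records.extend(json_data.get('value', []))
    next_link = json_data.get('#odata.nextLink')
    if next_link:
        return fetch_all(next_link, token, records)
    return records

# usage
all_rows = fetch_all(BASE_URL + "/odata/endpoint")
df = pd.json_normalize(all_rows)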
I know that this is late, but you could look at this article from Towards Data Science by Ephram Mwai.
He pretty much solved the problem with a good script.

API POST Call throws 401 error when used pytest

I am performing API testing using the pytest framework. The test fails every time with a 401 error, and I couldn't figure out what the issue was.
Here is the code :
import requests
import json, jsonpath
import urllib3
import constants

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# variables
dumpFile = "somepath"
url = "someUrl"
headers = {'Authorization': constants.consts['siteToken'],
           'accept': 'application/json',
           'content-type': 'application/json'}
# siteToken = 'Bearer jwt token'

# read json input file
input_file = open("json file path", 'r')
json_input = input_file.read()
request_json = json.loads(json_input)

# make POST request with JSON Input Body
r = requests.post(url, request_json, headers=headers)

# Verification of the response
assert r.status_code == 200

def test_json_result():
    # fetch header from response
    print(r.headers.get("Date"))
    # parse response to JSON Format
    response_json = json.loads(r.text)
    # validate response using Json Path
    name = jsonpath.jsonpath(response_json, 'name')
    print(name)
I solved this by just passing json=your_payload instead of the positional data argument.
import requests
import json, jsonpath
import urllib3
import constants

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# variables
dumpFile = "somepath"
url = "someUrl"
headers = {'Authorization': constants.consts['siteToken'],
           'accept': 'application/json',
           'content-type': 'application/json'}
# siteToken = 'Bearer jwt token'

# read json input file
input_file = open("json file path", 'r')
json_input = input_file.read()
request_json = json.loads(json_input)

def test_json_result():
    # make POST request with JSON Input Body
    r = requests.post(url, json=request_json, headers=headers)
    # Verification of the response
    assert r.status_code == 200
    # fetch header from response
    print(r.headers.get("Date"))
    # parse response to JSON Format
    response_json = json.loads(r.text)
    # validate response using Json Path
    name = jsonpath.jsonpath(response_json, 'name')
    print(name)
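
For context on why that works: the second positional argument of requests.post is data, which sends a dict form-encoded, while json= serializes it to a JSON body and sets the Content-Type header for you, which is presumably what this API expects. A small illustration against httpbin, a public echo service used here only as a stand-in:

import requests

payload = {"name": "example"}

# form-encoded body, Content-Type: application/x-www-form-urlencoded
r1 = requests.post("https://httpbin.org/post", data=payload)

# JSON body, Content-Type: application/json set automatically
r2 = requests.post("https://httpbin.org/post", json=payload)

print(r1.request.headers["Content-Type"])
print(r2.request.headers["Content-Type"])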

How to pass a search key and get the result through bs4

def get_main_page_url(urlparameter, strDestPath, strMD5):
    # urlparameter = "https://malwr.com/analysis/search/"
    base_url = 'https://malwr.com/'
    url = 'https://malwr.com/account/login/'
    username = 'myname'
    password = 'pswd'
    session = requests.Session()

    # getting csrf value
    response = session.get(url)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')
    ## csrf1 = form.find('input', attrs={'name': 'search'}).get('value')

    # logging in
    data = {
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': csrf
    }
    session.post(url, data=data)

    # getting analysis data
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')
    ## csrf1 = form.find('input', attrs={'name': 'search'}).get('value')
    data = {
        'search': strMD5,
        'csrfmiddlewaretoken': csrf
    }
    session.post(urlparameter, data = data)
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    print(soup)

    if None != soup.find('section', id='file').find('table')('tr')[-1].a:
        link = soup.find('section', id='file').find('table')('tr')[-1].a.get('href')
        link = urljoin(base_url, link)
        webFile = session.get(link)
        filename = link.split('/')[-2]
        filename = arg + filename
        localFile = open(filename, 'wb')
        localFile.write(webFile.content)
        webFile.close()
        localFile.close()
I am able to log in by extracting the csrf token. Then I try to submit the MD5 to the search on malwr.com, but I am not able to get the page that shows the search results for the MD5 I sent.
I want to search for the MD5 that I pass along with the csrf token.
Please let me know what is wrong in the code.
You've done almost everything correctly, except that you need to pass the result of the POST request to BeautifulSoup. Replace:
session.post(urlparameter, data = data)
response = session.get(urlparameter)
with:
response = session.post(urlparameter, data=data)
Worked for me (I had an account at malwr).
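
In context, the relevant part of the search step would then read something like this (just a sketch of the changed lines, keeping the question's BeautifulSoup usage):

# submit the search form and parse the POST response directly
response = session.post(urlparameter, data=data)
soup = bs4.BeautifulSoup(response.content)
print(soup)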
