I write a script for web scraping and it is successfully scraping data. Only problem is with exporting data to JSON file
def scrape_post_info(url):
content = get_page_content(url)
title, description, post_url = get_post_details(content, url)
job_dict = {}
job_dict['title'] = title
job_dict['Description'] = description
job_dict['url'] = post_url
#here json machanism
json_job = json.dumps(job_dict)
with open('data.json', 'r+') as f:
f.write("[")
f.seek(0)
f.write(json_job)
txt = f.readline()
if txt.endswith("}"):
f.write(",")
def crawl_web(url):
while True:
post_url = get_post_url(url)
for urls in post_url:
urls = urls
scrape_post_info(urls)
# Execute the main fuction 'crawl_web'
if __name__ == '__main__':
crawl_web('www.examp....com')
The data is exported to JSON but it is not proper format of JSON. I am expecting the data should look like:
[
{
"title": "this is title",
"Description": " Fendi is an Italian luxury labelarin. ",
"url": "https:/~"
},
{
"title": " - Furrocious Elegant Style",
"Description": " the Italian luxare vast. ",
"url": "https://www.s"
},
{
"title": "Rome, Fountains and Fendi Sunglasses",
"Description": " Fendi started off as a store. ",
"url": "https://www.~"
},
{
"title": "Tipsnglasses",
"Description": "Whether irregular orn season.",
"url": "https://www.sooic"
},
]
How can I achieve this?
How about:
def scrape_post_info(url):
content = get_page_content(url)
title, description, post_url = get_post_details(content, url)
return {"title": title, "Description": description, "url": post_url}
def crawl_web(url):
while True:
jobs = []
post_urls = get_post_url(url)
for url in post_urls:
jobs.append(scrape_post_info(url))
with open("data.json", "w") as f:
json.dumps(jobs)
# Execute the main fuction 'crawl_web'
if __name__ == "__main__":
crawl_web("www.examp....com")
Note that this will rewrite your entire file on each iteration of "post_urls", so it might become quite slow with large files and slow I/O.
Depending on how long your job is running and how much memory you have, you might want to move the file writing out of the for loop, and only write it out once.
Note: if you really want to write JSON streaming, you might want to look at something like this package: https://pypi.org/project/jsonstreams/, however I'd suggest to choose another format such as CSV that is much more well suited to streaming writes.
Related
I have a script that takes an ID from a JSON file, adds it to a URL for an API request. The aim is to have a loop run through the 1000-ish ids and preduce one JSON file with all the information contained within.
The current code calls the first request and creates and populates the JSON file, but when run in a loop it throws a key error.
import json
import requests
fname = "NewdataTest.json"
def write_json(data, fname):
fname = "NewdataTest.json"
with open(fname, "w") as f:
json.dump(data, f, indent = 4)
with open (fname) as json_file:
data = json.load(json_file)
temp = data[0]
#print(newData)
y = newData
data.append(y)
# Read test.json to get tmdb IDs
tmdb_ids = []
with open('test.json', 'r') as json_fp:
imdb_info = json.load(json_fp)
tmdb_ids = [movie_info['tmdb_id'] for movies_chunk in imdb_info for movie_index, movie_info in movies_chunk.items()]
# Add IDs to API call URL
for tmdb_id in tmdb_ids:
print("https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "?api_key=****")
# Send API Call
response_API = requests.get("https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "?api_key=****")
# Check API Call Status
print(response_API.status_code)
write_json((response_API.json()), "NewdataTest.json")
The error is in this line "temp = data[0]" I have tried printing the keys for data, nothing. At this point, I have no idea where I am with this as I have hacked it about it barely resembles anything like a cohesive piece of code. My aim was to make a simple function to get the data from the JSON, one to produce the API call URLs, and one to write the results to the new JSON.
Example of API reponse JSON:
{
"adult": false,
"backdrop_path": "/e1cC9muSRtAHVtF5GJtKAfATYIT.jpg",
"belongs_to_collection": null,
"budget": 0,
"genres": [
{
"id": 10749,
"name": "Romance"
},
{
"id": 35,
"name": "Comedy"
}
],
"homepage": "",
"id": 1063242,
"imdb_id": "tt24640474",
"original_language": "fr",
"original_title": "Disconnect: The Wedding Planner",
"overview": "After falling victim to a scam, a desperate man races the clock as he attempts to plan a luxurious destination wedding for an important investor.",
"popularity": 34.201,
"poster_path": "/tGmCxGkVMOqig2TrbXAsE9dOVvX.jpg",
"production_companies": [],
"production_countries": [
{
"iso_3166_1": "KE",
"name": "Kenya"
},
{
"iso_3166_1": "NG",
"name": "Nigeria"
}
],
"release_date": "2023-01-13",
"revenue": 0,
"runtime": 107,
"spoken_languages": [
{
"english_name": "English",
"iso_639_1": "en",
"name": "English"
},
{
"english_name": "Afrikaans",
"iso_639_1": "af",
"name": "Afrikaans"
}
],
"status": "Released",
"tagline": "",
"title": "Disconnect: The Wedding Planner",
"video": false,
"vote_average": 5.8,
"vote_count": 3
}
You can store all results from the API calls into a list and then save this list in Json format into a file. For example:
#...
all_data = []
for tmdb_id in tmdb_ids:
print("https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "?api_key=****")
# Send API Call
response_API = requests.get("https://api.themoviedb.org/3/movie/" + str(tmdb_id) + "?api_key=****")
# Check API Call Status
print(response_API.status_code)
if response_API.status_code == 200:
# store the Json data in a list:
all_data.append(response_API.json())
# write the list to file
with open('output.json', 'w') as f_out:
json.dump(all_data, f_out, indent=4)
This will produce output.json with all responses in Json format.
I wrote a web scraping script and it is working great. I am trying to write the scraped data to json file but i failed.
this is my snippet:
def scrape_post_info(url):
content = get_page_content(url)
title, description, post_url = get_post_details(content, url)
job_dict = {}
job_dict['title'] = title
job_dict['Description'] = description
job_dict['url'] = post_url
json_job = json.dumps(job_dict)
with open('data.json', 'a') as f:
json.dump(json_job, f)
if __name__ == '__main__':
urls = ['url1', 'url2', 'url3', 'url4']
for url in urls:
scrape_post_info(url)
ignore two function i called inside the function, problem not with them
My problem only is writing to json.
Currently i am getting the scraped data like this below and there are wrong format
data.json are below:
{
"title": "this is title",
"Description": " Fendi is an Italian luxury labelarin. ",
"url": "https:/~"
}
{
"title": " - Furrocious Elegant Style",
"Description": " the Italian luxare vast. ",
"url": "https://www.s"
}
{
"title": "Rome, Fountains and Fendi Sunglasses",
"Description": " Fendi started off as a store. ",
"url": "https://www.~"
}
{
"title": "Tipsnglasses",
"Description": "Whether irregular orn season.",
"url": "https://www.sooic"
}
but it should be like these:
[
{
"title": "this is title",
"Description": " Fendi is an Italian luxury labelarin. ",
"url": "https:/~"
},
{
"title": " - Furrocious Elegant Style",
"Description": " the Italian luxare vast. ",
"url": "https://www.s"
},
{
"title": "Rome, Fountains and Fendi Sunglasses",
"Description": " Fendi started off as a store. ",
"url": "https://www.~"
},
{
"title": "Tipsnglasses",
"Description": "Whether irregular orn season.",
"url": "https://www.sooic"
},
]
I am not getting exactly why i am not getting data in json file in proper formate..
Can anyone help me in this?
You can try this code to solve your problem.
you will get exact file as you expected above, following is the code:
import json
def scrape_post_info(url, f):
content = get_page_content(url)
title, description, post_url = get_post_details(content, url)
job_dict = {}
job_dict['title'] = title
job_dict['Description'] = description
job_dict['url'] = post_url
json_job = json.dumps(job_dict)
f.seek(0)
txt = f.readline()
if txt.endswith("}"):
f.write(",")
f.write(json_job)
if __name__ == '__main__':
urls = ['url1', 'url2', 'url3', 'url4']
with open('data.json', 'r+') as f:
f.write("[")
for url in urls:
scrape_post_info(url,f)
f.write("]")
I have around 1000 JSON files in a folder, I want to convert all those files to its CSV formats. The sample of JSON file is given below.
{"Reviews":
[
{"Title": "Don't give up on your NOOK HD just yet - make it a Lean Mean Jellybean with OS 4.2.2",
"Author": "DC10",
"ReviewID": "ROX6OFU4UAOK1",
"Overall": "5.0",
"Content": "Hi Folks, ",
"Date": "February 18, 2013"},
{"Title": "freezing problem",
"Author": "joseph",
"ReviewID": "R1QVAPUULQZ57B",
"Overall": "3.0",
"Content": "I am still setting it up the way I want it I havve downloaded anything to it yet and it freezes horribly. All in all tho I love this device.",
"Date": "September 11, 2013"}
],
"ProductInfo": {"Price": "$229.00", "Features": "NOOK HD 7\" 16GB Tablet", "Name": "NOOK HD 7\" 16GB Tablet",
"ImgURL": "http://ecx.images-amazon.com/images/I/41jpVvVz41L._SY300_.jpg",
"ProductID": "1400501520"}}
example: json to csv
imports:
import json
import csv
some json :D
data_json = {
"employee_details": [{
"nom": "Bobby",
"age": "19",
}]
}
and some code
employee_data = data_json['employee_details']
# open a file for writing
employ_data = open('EmployData.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(employ_data)
count = 0
for employee in employee_data:
if count == 0:
header = employee.keys()
csvwriter.writerow(header)
count += 1
csvwriter.writerow(employee.values())
employ_data.close()
you will find the file EmployeeData.csv next to your script
If you're willing to install a 3rd party module, you can do this in a few lines with pandas
for json_file in files:
with open(json_file) as f:
json_data = json.load(f)
df = pandas.DataFrame.from_records(json_data)
df.to_csv(json_file.rstrip('.json') + '.csv')
I have the following Python code:
import requests
import json
from bs4 import BeautifulSoup
url = requests.get('https://www.perfectimprints.com/custom-promos/20492/Beach-Balls.html')
source = BeautifulSoup(url.text, 'html.parser')
products = source.find_all('div', class_="product_wrapper")
def get_product_details(product):
product_name = product.find('div', class_="product_name").a.text
sku = product.find('div', class_="product_sku").text
product_link = product.find('div', class_="product_image_wrapper").find("a")["href"]
src = product.find('div', class_="product_image_wrapper").find('a').find("img")["src"]
return {
"title": product_name,
"link": product_link,
"sku": sku,
"src": src
}
all_products = [get_product_details(product) for product in products]
with open("products.json", "w") as write_file:
json.dump(all_products, write_file)
print("Success")
This code works perfectly as written. The problem is I want the structure instead of
[
{
"title": "12\" Beach Ball",
"link": "/promos/PI-255-751/12-Beach-Ball.html?cid=20492",
"sku": " \n\t\t\t\t#PI-255-751\n\t\t\t",
"src": "https://12f598f3b6e7e912e4cd-a182d9508ed57781ad8837d0e4f7a945.ssl.cf5.rackcdn.com/thumb/751_group.jpg"
},
]
I want it to be:
{
"items": [
{
"title": "12\" Beach Ball",
"link": "/promos/PI-255-751/12-Beach-Ball.html?cid=20492",
"sku": " \n\t\t\t\t#PI-255-751\n\t\t\t",
"src": "https://12f598f3b6e7e912e4cd-a182d9508ed57781ad8837d0e4f7a945.ssl.cf5.rackcdn.com/thumb/751_group.jpg"
},
]
}
Here's a link to what I have working in Repl.it, just so you don't have to set up your own: https://repl.it/repls/AttractiveDimpledTheory
Side note: Would love to also be able to remove all of the \n and \t in the skus if possible.
Here you're dumping your all_products list directly to JSON:
with open("products.json", "w") as write_file:
json.dump(all_products, write_file)
The JSON you want just has that list in an object. Something like
with open("products.json", "w") as write_file:
json.dump({'items': all_products}, write_file)
should do what you want.
Generally speaking there's a 1:1 relationship between your Python data structure and the JSON it generates. If you build the right Python data structure you'll get the right JSON. Here we're using a dict (which maps to a JSON object) to wrap your existing list (which maps to a JSON array).
Side note: Would love to also be able to remove all of the \n and \t in the skus if possible.
Assuming you also want to remove spaces, you can just use str.strip(), which strips whitespace by default:
return {
"title": product_name,
"link": product_link,
"sku": sku.strip(), # <-- here
"src": src
}
I am having trouble with converting a dictionary to a string in python. I am trying to extract the information from one of my variables but cannot seem to remove the square brackets surrounding the information
for line in str(object):
if line.startswith ('['):
new_object = object.replace('[', '')
Is there a way to remove the square brackets or do I have to find another way of taking the information out of the dictionary?
Edit:
in more detail what i am trying to do here is the following
import requests
city = 'dublin'
country = 'ireland'
info = requests.get('http://api.openweathermap.org/data/2.5/weather?q='+city +','+ country +'&mode=json')
weather = info.json()['weather']
fh = open('/home/Ricky92d3/city.txt', 'w')
fh.write(str(weather))
fh.close()
fl = open('/home/Ricky92d3/city.txt')
Object = fl.read()
fl.close()
for line in str(Object):
if line.startswith ('['):
new_Object = Object.replace('[', '')
if line.startswith ('{'):
new_Object = Object.replace('{u', '')
print new_Object
i hope this makes what i am trying to do a little more clear
The object returned by info.json() is a Python dictionary, so you can access its contents using normal Python syntax. I admit that it can get a little bit tricky, since JSON dictionaries often contain other dictionaries and lists, but it's generally not too hard to figure out what's what if you print the JSON object out in a nicely formatted way. The easiest way to do that is by using the dumps() function in the standard Python json module.
The code below retrieves the JSON data into a dict called data.
It then prints the 'description' string from the list in the 'weather' item of data.
It then saves all the data (not just the 'weather' item) as an ASCII-encoded JSON file.
It then reads the JSON data back in again to a new dict called newdata, and pretty-prints it.
Finally, it prints the weather description again, to verify that we got back what we saw earlier. :)
import requests, json
#The base URL of the weather service
endpoint = 'http://api.openweathermap.org/data/2.5/weather'
#Filename for saving JSON data to
fname = 'data.json'
city = 'dublin'
country = 'ireland'
params = {
'q': '%s,%s' % (city, country),
'mode': 'json',
}
#Fetch the info
info = requests.get(endpoint, params=params)
data = info.json()
#print json.dumps(data, indent=4)
#Extract the value of 'description' from the list in 'weather'
print '\ndescription: %s\n' % data['weather'][0]['description']
#Save data
with open(fname, 'w') as f:
json.dump(data, f, indent=4)
#Reload data
with open(fname, 'r') as f:
newdata = json.load(f)
#Show all the data we just read in
print json.dumps(newdata, indent=4)
print '\ndescription: %s\n' % data['weather'][0]['description']
output
description: light intensity shower rain
{
"clouds": {
"all": 75
},
"name": "Dublin",
"visibility": 10000,
"sys": {
"country": "IE",
"sunset": 1438374108,
"message": 0.0118,
"type": 1,
"id": 5237,
"sunrise": 1438317600
},
"weather": [
{
"description": "light intensity shower rain",
"main": "Rain",
"id": 520,
"icon": "09d"
}
],
"coord": {
"lat": 53.340000000000003,
"lon": -6.2699999999999996
},
"base": "stations",
"dt": 1438347600,
"main": {
"pressure": 1014,
"humidity": 62,
"temp_max": 288.14999999999998,
"temp": 288.14999999999998,
"temp_min": 288.14999999999998
},
"id": 2964574,
"wind": {
"speed": 8.1999999999999993,
"deg": 210
},
"cod": 200
}
description: light intensity shower rain
I'm not quite sure what you're trying to do here (without seeing your dictionary) but if you have a string like x = "[myString]" you can just do the following:
x = x.replace("[", "").replace("]", "")
If this isn't working, there is a high chance you're actually getting a list returned. Though if that was the case you should see an error like this:
>>> x = [1,2,3]
>>> x.replace("[", "")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'list' object has no attribute 'replace'
Edit 1:
I think there's a misunderstanding of what you're getting back here. If you're just looking for a csv output file with the weather from your api try this:
import requests
import csv
city = 'dublin'
country = 'ireland'
info = requests.get('http://api.openweathermap.org/data/2.5/weather?q='+city +','+ country +'&mode=json')
weather = info.json()['weather']
weather_fieldnames = ["id", "main", "description", "icon"]
with open('city.txt', 'w') as f:
csvwriter = csv.DictWriter(f, fieldnames=weather_fieldnames)
for w in weather:
csvwriter.writerow(w)
This works by looping through the list of items you're getting and using a csv.DictWriter to write it as a row in the csv file.
Bonus
Don't call your dictionary object - It's a reserved word for the core language.