Reading the Developer's Guide I found how to delete a single contact:
def delete_contact(gd_client, contact_url):
    # Retrieving the contact is required in order to get the Etag.
    contact = gd_client.GetContact(contact_url)
    try:
        gd_client.Delete(contact)
    except gdata.client.RequestError, e:
        if e.status == 412:
            # Etags mismatch: handle the exception.
            pass
Is there a way to delete all contacts? I could not find a way to do so, and iterating over the contacts one by one takes a few minutes for a large batch.
If you are performing a lot of operations, use the batch requests. You can have the server perform multiple operations with a single HTTP request. Batch requests are limited to 100 operations at a time. You can find more information about batch operations in the Google Data APIs Batch Processing documentation.
To delete all contacts, use the ContactsRequest.Batch operation. For this operation, create a List&lt;Contact&gt;, set the BatchData on each contact item, and then pass the list to ContactsRequest.Batch.
private void DeleteAllContacts()
{
    RequestSettings rs = new RequestSettings(this.ApplicationName, this.userName, this.passWord);
    rs.AutoPaging = true; // this will result in automatic paging for listing and deleting all contacts
    ContactsRequest cr = new ContactsRequest(rs);
    Feed<Contact> f = cr.GetContacts();
    List<Contact> list = new List<Contact>();
    int i = 0;
    foreach (Contact c in f.Entries)
    {
        c.BatchData = new GDataBatchEntryData();
        c.BatchData.Id = i.ToString();
        c.BatchData.Type = GDataBatchOperationType.delete;
        i++;
        list.Add(c);
    }
    // Note: batch requests are limited to 100 operations, so chunk the list for larger feeds.
    cr.Batch(list, new Uri(f.AtomFeed.Batch), GDataBatchOperationType.delete);
    f = cr.GetContacts();
    Assert.IsTrue(f.TotalResults == 0, "Feed should be empty now");
}
I am trying to insert a large amount of data (13 million rows) into Firebase Firestore, but it is taking forever to finish.
Currently, I am inserting the data row by row using Python. I tried to use multi-threading, but it is still very slow and not efficient (I have to stay connected to the Internet).
So, is there another way to insert a file into Firebase (a more efficient way to batch-insert the data)?
This is the data format:
[
    {
        '010045006031': {
            'FName': 'Ahmed',
            'LName': 'Aline'
        }
    },
    {
        '010045006031': {
            'FName': 'Ali',
            'LName': 'Adel'
        }
    },
    {
        '010045006031': {
            'FName': 'Osama',
            'LName': 'Luay'
        }
    }
]
This is the code that I am using:
import firebase_admin
from firebase_admin import credentials, firestore
from random import random

def Insert2DB(I):
    doc_ref = db.collection('DBCoolect').document(I['M'])
    doc_ref.set({"FirstName": I['FName'], "LastName": I['LName']})

cred = credentials.Certificate("ServiceAccountKey.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

List = []
# List is read from a file
List.append({'M': str(random()), 'FName': 'Ahmed', 'LName': 'Aline'})
List.append({'M': str(random()), 'FName': 'Ali', 'LName': 'Adel'})
List.append({'M': str(random()), 'FName': 'Osama', 'LName': 'Luay'})

for item in List:
    Insert2DB(item)
Thanks a lot ...
Firestore does not offer any way to "bulk update" documents. They have to be added individually. There is a facility to batch write, but that's limited to 500 documents per batch, and that's not likely to speed up your process by a large amount.
If you want to optimize the rate at which documents can be added, I suggest reading the documentation on best practices for read and write operations and designing for scale. All things considered, however, there is really no "fast" way to get 13 million documents into Firestore. You're going to be writing code to add each one individually. Firestore is not optimized for fast writes. It's optimized for fast reads.
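For reference, here is a minimal sketch of the batch-write facility mentioned above, using the same firebase_admin client and collection name as the question; the 500-document-per-batch limit means the rows have to be chunked, and the data shape is assumed to match the question's sample:
import firebase_admin
from firebase_admin import credentials, firestore

cred = credentials.Certificate("ServiceAccountKey.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

def chunked(rows, size=500):
    # Yield successive slices of at most `size` rows (Firestore's per-batch limit).
    for i in range(0, len(rows), size):
        yield rows[i:i + size]

def batch_insert(rows):
    # `rows` is assumed to be a list of dicts shaped like the question's sample,
    # e.g. {'010045006031': {'FName': 'Ahmed', 'LName': 'Aline'}}.
    for chunk in chunked(rows):
        batch = db.batch()
        for row in chunk:
            for doc_id, fields in row.items():
                doc_ref = db.collection('DBCoolect').document(doc_id)
                batch.set(doc_ref, {"FirstName": fields['FName'], "LastName": fields['LName']})
        batch.commit()
Each commit() is one round trip for up to 500 writes, which cuts down network overhead, but as noted above it will not fundamentally change the write throughput for 13 million documents.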
Yes, there is a way to bulk-add data into Firebase using Python.
def process_streaming(self, response):
    for response_line in response.iter_lines():
        if response_line:
            json_response = json.loads(response_line)
            sentiment = self.sentiment_model(json_response["data"]["text"])
            self.datalist.append(self.post_process_data(json_response, sentiment))
This extracts data from the response and saves it into a list.
If we want the loop to wait a few minutes and then upload the accumulated objects in the list to the database, add the code below after the if block.
if (self.start_time + 300 < time.time()):
    print(f"{len(self.datalist)} data send to database")
    self.batch_upload_data(self.datalist)
    self.datalist = []
    self.start_time = time.time() + 300
The final function looks like this:
def process_streaming(self, response):
    for response_line in response.iter_lines():
        if response_line:
            json_response = json.loads(response_line)
            sentiment = self.sentiment_model(json_response["data"]["text"])
            self.datalist.append(self.post_process_data(json_response, sentiment))

            if (self.start_time + 300 < time.time()):
                print(f"{len(self.datalist)} data send to database")
                self.batch_upload_data(self.datalist)
                self.datalist = []
                self.start_time = time.time() + 300
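The batch_upload_data helper is not shown in the answer. A minimal sketch, assuming the class holds a Firestore client as self.db and that the helper commits the buffered items with Firestore's batched writes (the collection name and the document ID field are placeholders), might look like this:
def batch_upload_data(self, datalist):
    # Commit the buffered items in chunks of 500, Firestore's per-batch limit.
    for i in range(0, len(datalist), 500):
        batch = self.db.batch()
        for item in datalist[i:i + 500]:
            # 'tweets' and item['id'] are hypothetical; adapt them to your schema.
            doc_ref = self.db.collection('tweets').document(str(item['id']))
            batch.set(doc_ref, item)
        batch.commit()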
I am a beginner in Python and am trying to use the webhose.io API to collect data from the web. The problem is that this crawler retrieves 100 objects per JSON response, i.e., to retrieve 500 results it is necessary to make 5 requests. When I use the API, I am not able to collect all the data at once. I was able to collect the first 100 results, but when moving on to the next request an error occurs: the first post is repeated. Here is the code:
import webhoseio

webhoseio.config(token="Xxxxx")
query_params = {
    "q": "trump:english",
    "ts": "1498538579353",
    "sort": "crawled"
}
output = webhoseio.query("filterWebContent", query_params)

x = 0
for var in output['posts']:
    print output['posts'][x]['text']
    print output['posts'][x]['published']
    if output['posts'] is None:
        output = webhoseio.get_next()
        x = 0
Thanks.
Use the following (the original loop never advances x and only calls get_next() when output['posts'] is None, which is why the same first post keeps coming back):
while output['posts']:
    for var in output['posts']:
        print var['text']
        print var['published']
    output = webhoseio.get_next()
How can I print all the alarm names, instead of only 50, when using the describe_alarms function?
Code, using Python:
import boto

conn = boto.connect_cloudwatch()
alarms = conn.describe_alarms()
for item in alarms:
    print item.name
Thanks.
Even though I am a bit late to the party, here is my solution (in Java). You have to get the next token and keep asking for results in a loop until there is no next token, so it behaves like pagination on a website.
String nextToken = null;
List<MetricAlarm> metricAlarms = new ArrayList<>();
for (int i = 0; i < 100; i++) {
    DescribeAlarmsRequest describeAlarmsRequest = new DescribeAlarmsRequest();
    describeAlarmsRequest.setNextToken(nextToken);
    describeAlarmsRequest.setMaxRecords(100);
    DescribeAlarmsResult describeAlarmsResult = getClient().describeAlarms(describeAlarmsRequest);
    List<MetricAlarm> metricAlarmsTmp = describeAlarmsResult.getMetricAlarms();
    metricAlarms.addAll(metricAlarmsTmp);
    nextToken = describeAlarmsResult.getNextToken();
    logger.info("nextToken: {}", nextToken);
    if (nextToken == null) {
        break;
    }
}
logger.info("metricAlarms size: {}", metricAlarms.size());
Of course there is room for improvement, e.g. using a while loop instead of a for loop.
UPDATE:
Here is my refined version:
String nextToken = null;
List<MetricAlarm> metricAlarms = new ArrayList<>();
while (nextToken != null || metricAlarms.size() == 0) {
    DescribeAlarmsRequest describeAlarmsRequest = new DescribeAlarmsRequest().withNextToken(nextToken).withMaxRecords(100); // create the request
    DescribeAlarmsResult describeAlarmsResult = getClient().describeAlarms(describeAlarmsRequest); // get the result
    metricAlarms.addAll(describeAlarmsResult.getMetricAlarms()); // add new alarms to our list
    nextToken = describeAlarmsResult.getNextToken(); // check if we have a nextToken
    if (nextToken == null && metricAlarms.size() == 0) { // if there are no alarms in AWS we would never exit the loop -> intercept that
        break;
    }
}
logger.info("metricAlarms size: {}", metricAlarms.size());
By default it returns 50. If you want more, set max_records=value and try.
Due to an underlying AWS API restriction, it will return a maximum of 100 alarms per call. I don't know whether that has changed since.
conn.describe_alarms(max_records=100)
Help on method describe_alarms in module boto.ec2.cloudwatch:

describe_alarms(self, action_prefix=None, alarm_name_prefix=None,
                alarm_names=None, max_records=None, state_value=None, next_token=None)

    :type max_records: int
    :param max_records: The maximum number of alarm descriptions to retrieve.
Here's a complete example of how to paginate through the records in order to guarantee that you retrieve all of them, rather than being limited by the maximum record count of the CloudWatch Alarms API:
alarmMaxRecords = 10

response = client.describe_alarms(
    AlarmNamePrefix=prefix,
    MaxRecords=alarmMaxRecords
)

alarmsItems = []
while response:
    alarmsItems += response['MetricAlarms']
    response = client.describe_alarms(AlarmNamePrefix=prefix, MaxRecords=alarmMaxRecords, NextToken=response['NextToken']) if 'NextToken' in response else None

for alarm in alarmsItems:
    # Do something with the alarm
    print(alarm)
The above will retrieve 10 records at a time, but can be anything up to 100.
Or more simply using the paginate method provided by boto3:
import boto3

# Create a CloudWatch client
cloudwatch = boto3.client('cloudwatch')

# List alarms through the pagination interface
paginator = cloudwatch.get_paginator('describe_alarms')
for response in paginator.paginate(AlarmNamePrefix=prefix, MaxRecords=alarmMaxRecords):
    # Do something with the page of alarms
    print(response['MetricAlarms'])
I am reading a large amount of data from an API provider. Once I get the response, I need to scan through it, repackage the data, and put it into the App Engine datastore. A particularly big account will contain ~50k entries.
Every time I get some entries from the API, I store 500 entries as a batch in a temp table and send a processing task to a queue. To avoid too many tasks getting jammed in one queue, I use 6 queues in total:
count = 0
worker_number = 6
for folder, property in entries:
    data[count] = {
        # repackaging data here
    }
    count = (count + 1) % 500
    if count == 0:
        cache = ClientCache(parent=user_key, data=json.dumps(data))
        cache.put()
        params = {
            'access_token': access_token,
            'client_key': client.key.urlsafe(),
            'user_key': user_key.urlsafe(),
            'cache_key': cache.key.urlsafe(),
        }
        taskqueue.add(
            url=task_url,
            params=params,
            target='dbworker',
            queue_name='worker%d' % worker_number)
        worker_number = (worker_number + 1) % 6
And the task_url will lead to the following:
logging.info('--------------------- Process File ---------------------')
user_key = ndb.Key(urlsafe=self.request.get('user_key'))
client_key = ndb.Key(urlsafe=self.request.get('client_key'))
cache_key = ndb.Key(urlsafe=self.request.get('cache_key'))

cache = cache_key.get()
data = json.loads(cache.data)

for property in data.values():
    logging.info(property)
    try:
        key_name = '%s%s' % (property['key1'], property['key2'])
        metadata = Metadata.get_or_insert(
            key_name,
            parent=user_key,
            client_key=client_key,
            # ... other info
        )
        metadata.put()
    except StandardError, e:
        logging.error(e.message)
All the tasks run on a backend.
With this structure it works fine... well, most of the time. But sometimes I get this error:
2013-09-19 15:10:07.788
suspended generator transaction(context.py:938) raised TransactionFailedError(The transaction could not be committed. Please try again.)
W 2013-09-19 15:10:07.788
suspended generator internal_tasklet(model.py:3321) raised TransactionFailedError(The transaction could not be committed. Please try again.)
E 2013-09-19 15:10:07.789
The transaction could not be committed. Please try again.
It seems to be a problem of writing to the datastore too frequently? I want to find out how I can balance the pace and let the workers run smoothly...
Also, is there any other way I can improve the performance further? My queue configuration is something like this:
- name: worker0
  rate: 120/s
  bucket_size: 100
  retry_parameters:
    task_retry_limit: 3
You are writing single entities at a time.
How about modifying your code to write in batches using ndb.put_multi? That will reduce the round-trip time for each transaction.
And why are you using get_or_insert when you overwrite the record each time anyway? You might as well just write it. Both of these changes will reduce the workload a lot.
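For illustration, here is a minimal sketch of the task handler's loop rewritten to buffer entities and write them with ndb.put_multi; the Metadata model and key fields come from the question, and building the entity directly (instead of get_or_insert) assumes you are happy to overwrite whatever is stored under that key:
from google.appengine.ext import ndb

entities = []
for property in data.values():
    key_name = '%s%s' % (property['key1'], property['key2'])
    # Build the entity directly; the record is overwritten on every run anyway.
    entities.append(Metadata(
        id=key_name,
        parent=user_key,
        client_key=client_key,
        # ... other info
    ))

# One batched RPC instead of ~500 individual put() calls.
ndb.put_multi(entities)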
I have attempted to clean up and revise the code from an answer here for my needs: I only want to delete records from the Reservations model that are older than the date expressed in the GET as yy, mm, dd.
If I am correctly anticipating the effect of cleanTable/2012/10/5 against the routing ('/cleanTable/([\d]+)/([\d]+)/([\d]+)', CleanTable), then my code would only delete at most 50 (10 * nlimit) records.
Btw, the author of the original code (who likely no longer subscribes to SO) claimed his main trick for accomplishing this was "to include a redirect in the html instead of using self.redirect".
I am unfamiliar with raise Exception and the like, but my instinct would be to add a raise Exception or raise StopIteration to the for loop after it is turned into a while loop. It is not clear to me, though, whether raising a StopIteration exception actually causes iteration to stop or whether more is needed. Also, I don't know how to revise the code so the html ends cleanly on an early exit.
class CleanTable(BaseHandler):
    def get(self, yy, mm, dd):
        nlimit = 5
        iyy = int(yy)
        imm = int(mm)
        idd = int(dd)
        param = date(iyy, imm, idd)

        q = Reservations.all(keys_only=True)
        q.filter("date < ", dt(iyy, imm, idd))
        results = q.fetch(nlimit)

        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write("""
            <html>
            <meta HTTP-EQUIV="REFRESH" content="url=http://yourapp.appspot.com/cleanTable">
            <body>""")
        try:
            for i in range(10):
                db.delete(results)
                results = q.fetch(nlimit, len(results))
                for r in results:
                    logging.info("r.name: %s" % r.name)
                self.response.out.write("<p> " + str(nlimit) + " removed</p>")
            self.response.out.write("""
                </body>
                </html>""")
        except Exception, inst:
            logging.info("inst: %s" % inst)
            self.response.out.write(str(inst))
This is not the best approach to clean up your models. A better approach would be to get the keys of your entities and create Task Queue tasks, where each task gets a batch of keys for the entities that need to be deleted or modified; see the sketch below.
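A rough sketch of that idea, using the deferred library to chain tasks that each delete one batch of keys (the batch size is an assumption, and the Reservations query is taken from the question):
from google.appengine.ext import db, deferred

BATCH_SIZE = 100  # assumed batch size; tune it for your workload

def delete_batch(cutoff, cursor=None):
    q = Reservations.all(keys_only=True)
    q.filter("date < ", cutoff)
    if cursor:
        q.with_cursor(cursor)
    keys = q.fetch(BATCH_SIZE)
    if keys:
        db.delete(keys)
        # Re-enqueue ourselves until no matching keys are left.
        deferred.defer(delete_batch, cutoff, q.cursor())
Each task does a bounded amount of work, instead of trying to delete everything inside a single request.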
Another approach would also be to create a cron job that will query for the x number of oldest modified entities, fix them and then store them back.
Finally, if your number of entities is so huge, you could also consider the use of Backends.
Hope this helps.
Here is my update routine; it has converted 500,000 entities. Be sure to run it on a backend instance (you can target a queue at a backend instance). Notice that I am using a cursor; that's the only way you can consistently iterate through the data (never use offset!).
Queue queue = QueueFactory.getQueue("grinderQueue");
queue.add(TaskOptions.Builder.withPayload(new DeferredTask() { // lets generate
    private static final long serialVersionUID = 1L;

    @Override
    public void run() {
        String cursor = null;
        boolean done = false;
        Date now = new Date(1346763868L * 1000L); // 09/04/2012
        while (!done) {
            DatastoreService datastore = DatastoreServiceFactory.getDatastoreService();
            Query query = new Query("Venue");
            query.setFilter(new FilterPredicate("timeOfLastUpdate", Query.FilterOperator.LESS_THAN, now));
            PreparedQuery pq = datastore.prepare(query);
            FetchOptions fetchOptions = FetchOptions.Builder.withLimit(1000);
            if (cursor != null)
                fetchOptions.startCursor(Cursor.fromWebSafeString(cursor));
            QueryResultList<Entity> results = pq.asQueryResultList(fetchOptions);
            List<Entity> updates = new ArrayList<Entity>();
            List<Entity> oldVenueUpdates = new ArrayList<Entity>();
            int tuples = 0;
            for (Entity en : results) {
                tuples++;
                try {
                    if (en.getProperty(Venue.VENUE_KEY) == null)
                        continue;
                    Entity newVenue = new Entity("CPVenue", (String) en.getProperty(Venue.VENUE_KEY));
                    newVenue.setPropertiesFrom(en);
                    newVenue.removeProperty("timeOfLastVenueScoreCalculation");
                    newVenue.removeProperty("actionsSinceLastVenueScoreCalculation");
                    newVenue.removeProperty("venueImageUrl");
                    newVenue.removeProperty("foursquareId");
                    newVenue.setProperty("geoCell", GeoCellCalculator.calcCellId(Double.valueOf((String) en.getProperty("lng")), Double.valueOf((String) en.getProperty("lat")), 8));
                    newVenue.setProperty(Venue.TIME_SINCE_LAST_UPDATE, new Date());
                    updates.add(newVenue);
                    Venue v = new Venue(newVenue);
                    // Set timestamp on Venue
                    en.setProperty("timeOfLastUpdate", now);
                    oldVenueUpdates.add(en);
                } catch (Exception e) {
                    logger.log(Level.WARNING, "", e);
                }
            }
            done = tuples == 0;
            tuples = 0;
            if (results.getCursor() != null)
                cursor = results.getCursor().toWebSafeString();
            else
                done = true;
            System.out.println("Venue Conversion LOOP updates.. " + updates.size() + " cursor " + cursor);
            datastore.put(updates);
            datastore.put(oldVenueUpdates);
        }
        System.out.println("Venue Conversion DONE");
    }
}));