Find query example: all documents whose topic field is exactly "rrd/srt" are copied to the collection subset.
// Copy each mqtt document whose topic equals "rrd/srt" into the subset collection.
db.mqtt
  .find({ topic: "rrd/srt" })
  .forEach(function (matchedDoc) {
    db.subset.insert(matchedDoc);
  });
Aggregation query: all tweets are grouped by $user.screen_name and counted, the result is sorted by count in descending order and then exported to the collection twitterars.
// Count tweets per screen name, sort by count (highest first) and write the
// result to the twitterars collection. $out is a pipeline stage and must be
// the last one; allowDiskUse belongs in the separate options argument.
db.tweets.aggregate(
  [
    { $group: { _id: '$user.screen_name', count: { $sum: 1 } } },
    { $sort: { count: -1 } },
    { $out: "twitterars" }
  ],
  { allowDiskUse: true }
);

or:

// Same per-user count as above, written to the twtUsers collection.
// The options document ({allowDiskUse: true}) is the second argument.
db.tweets.aggregate(
  [
    { $group: { _id: '$user.screen_name', count: { $sum: 1 } } },
    { $sort: { count: -1 } },
    { $out: "twtUsers" }
  ],
  { allowDiskUse: true }
);

Note the option {allowDiskUse: true}, which allows the query to use temporary files on disk if needed.

The following query counts the occurrences of each user:

// Count tweets per screen name and list the users with the most tweets first.
db.tweets.aggregate(
  [
    {
      $group: {
        _id: '$user.screen_name',
        count: { $sum: 1 }
      }
    },
    {
      $sort: {
        count: -1
      }
    }
  ],
  // Lets the sort stage use disk instead of aborting at the memory limit.
  { allowDiskUse: true }
);

Note that the option allowDiskUse is needed if the following error occurs:

{
"code" : 16819,
"errmsg" : "Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.",
"message" : "Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.",
"name" : "MongoError",
"ok" : 0
}

Optional output result to collection:

// Group, count and sort as before, then write the result to the texts collection.
// NOTE(review): the grouping key '$user.text' is kept from the original; if the
// intent is to group by the tweet text itself, it should probably be '$text' - confirm.
db.tweets.aggregate(
  [
    { $group: { _id: '$user.text', count: { $sum: 1 } } },
    { $sort: { count: -1 } },
    { $out: "texts" }
  ],
  { allowDiskUse: true }
);

 

Copy the selection where field date == 20120105 to a new collection subset. If subset does not exist, it will be created.

// Insert every my_collection document dated "20120105" into the subset collection.
db.my_collection
  .find({ date: "20120105" })
  .forEach(function (datedDoc) {
    db.subset.insert(datedDoc);
  });
When the documents have been copied to another collection, remove them from the source collection with:
// Delete the copied documents from the collection they were read from
// (my_collection), using the same date filter as the copy step.
db.my_collection.remove({ date: "20120105" });

Source: http://stackoverflow.com/questions/6452021/getting-timestamp-from-mongodb-id

// Take the single document with the highest _id and print the timestamp
// stored in that ObjectId.
db.tweets
  .find()
  .sort({ _id: -1 })
  .limit(1)
  .forEach(function (latest) {
    print(latest._id.getTimestamp());
  });

This returns the timestamp of the most recent record in the following format:

Tue May 03 2016 08:29:06 GMT+0200 (CEST)

Index MongoDB fields to accelerate search:

// Create a text index on the text field so that $text queries can be used.
db.tweets.createIndex({ text: "text" });
// Search the text index for "gouda". The filter and the projection are two
// separate arguments; the projection returns only _id and text.
db.tweets.find({ $text: { $search: "gouda" } }, { _id: 1, text: 1 });

Note that the search keyword is case-insensitive.

Presume the tweets are in database twitter and collection tweets

/**
 * Map step of the word count: lower-cases the document's `text` field,
 * splits it on single spaces and emits (word, 1) for every non-empty token.
 *
 * Called with `this` bound to the current document. `emit` is not defined
 * here; it is expected to be supplied by the environment running the map.
 * Documents without a string `text` field emit nothing.
 */
var map = function() {
    var text = this.text;
    // Skip documents where text is missing or not a string.
    if (typeof text !== "string") {
        return;
    }
    // quick lowercase to normalize per your requirements
    var words = text.toLowerCase().split(" ");
    for (var i = words.length - 1; i >= 0; i--) {
        // might want to remove punctuation, etc. here
        if (words[i]) { // consecutive spaces produce empty tokens; skip them
            emit(words[i], 1); // store a 1 for each word
        }
    }
};
/**
 * Reduce step of the word count: adds up all values collected for one key.
 *
 * @param key    the key being reduced (not used in the calculation)
 * @param values array of values to add together
 * @returns the sum of the values, or 0 for an empty array
 */
var reduce = function(key, values) {
    return values.reduce(function(total, value) {
        return total + value;
    }, 0);
};
// Run the map/reduce job over tweets and store the result in word_count.
db.tweets.mapReduce(map, reduce, { out: "word_count" });
// List the counted words, highest count first.
db.word_count.find().sort({ value: -1 });

The query above will produce a new collection word_count with the following contents:

/* 1 */
{
"_id" : "rt",
"value" : 43335
},

/* 2 */
{
"_id" : "in",
"value" : 36810
},

/* 3 */
{
"_id" : "the",
"value" : 27765
},

/* 4 */
{
"_id" : "to",
"value" : 20724
},

 

Other useful queries

// Return all tweets.
db.tweets.find();
// Count tweets per place.
db.tweets.aggregate([{ $group: { _id: '$place', count: { $sum: 1 } } }]);
// Count tweets per place, highest count first.
db.tweets.aggregate([{ $group: { _id: '$place', count: { $sum: 1 } } }, { $sort: { count: -1 } }]);
// Count how often each hashtag occurs, most used first.
db.tweets.aggregate([{ $unwind: '$entities.hashtags' }, { $group: { _id: '$entities.hashtags.text', tagCount: { $sum: 1 } } }, { $sort: { tagCount: -1 } }]);
// Same hashtag count with the stages passed as separate arguments instead of an
// array (older shell form; no options document can be passed this way).
db.tweets.aggregate({ $unwind: '$entities.hashtags' }, { $group: { _id: '$entities.hashtags.text', tagCount: { $sum: 1 } } }, { $sort: { tagCount: -1 } });
// Count identical tweet texts, most frequent first.
db.tweets.aggregate([{ $group: { _id: '$text', count: { $sum: 1 } } }, { $sort: { count: -1 } }]);
// Tweets whose text contains "Amsterdam" (case-sensitive regex); return only the text field.
db.tweets.find({ text: /Amsterdam/ }, { _id: 0, text: 1 });

 

OpenRefine (formerly Google Refine) is a powerful tool for working with messy data: cleaning it; transforming it from one format into another; and extending it with web services and external data.

http://openrefine.org/download.html

If cell A2 contains a UNIX timestamp 1459349774, use this formula

=A2/86400+25569

This returns a number. Then format that cell as a date (MM/DD/YYYY HH:MM:SS) to display the date and time:

03/30/2016 14:56:14

 

http://scholarcommons.usf.edu/cgi/viewcontent.cgi?article=1038&context=numeracy

Wine Quality = 12.145 + 0.00117 winter rainfall + 0.0614 average growing season temperature – 0.00368 harvest rainfall, seems almost comical