I have some code that processes tweets, about 5 million a day, in real time. They are currently stored in MongoDB and also posted to various Celery/RabbitMQ work queues. The average message size is 5524 bytes, so the cost of encoding and decoding these messages is an issue.
The results below were produced using the test code listed further down.
Standard Tweet message Encode/Decode with python built-in json package.
message size Test | Msg Size | De-serialize | Obj Size | Serialize cjson, bson, ujson | Storage | Storage Cost | |||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Empty object | 41 | 14.790, 37.565, 0.970 | 54 | 6.341, 41.856, 1.249 | 2050Mb | 0.21 | {} | ||||||||||||||||||||||||||||||||||||||||||||||||||||
Empty list | 41 | 15.069, 38.021, 1.005 | 54 | 6.675, 41.475, 1.400 | 2050Mb | 0.21 | [] | ||||||||||||||||||||||||||||||||||||||||||||||||||||
Object of objects | 843 | 107.750, 145.440, 25.525 | 3226 | 63.555, 828.235, 28.051 | 42150Mb | 4.21 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ||||||||||||||||||||||||||||||||||||||||||||||||||||
List of lists | 563 | 58.805, 81.950, 16.960 | 104 | 43.426, 815.965, 18.311 | 28150Mb | 2.81 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ||||||||||||||||||||||||||||||||||||||||||||||||||||
Object with only tweet id | 93 | 25.030, 53.360, 2.280 | 422 | 23.570, 83.445, 3.295 | 4650Mb | 0.47 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Full tweet message | 4386 | 697.221, 867.780, 188.560 | 12606 | 360.290, 5847.335, 201.610 | 219300Mb | 21.93 | https://gist.github.com/thanos/adf7e20b5f00551a38a8 | ||||||||||||||||||||||||||||||||||||||||||||||||||||
Message with string payload | 4899 | 446.661, 489.455, 82.361 | 9170 | 104.254, 327.455, 92.460 | 244950Mb | 24.50 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Object where the field names are codes | 4396 | 422.220, 456.184, 75.035 | 7880 | 73.345, 231.071, 81.570 | 219800Mb | 21.98 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Only fields of interest | 1911 | 285.840, 375.440, 74.100 | 7290 | 147.895, 2252.359, 78.495 | 95550Mb | 9.55 | https://gist.github.com/thanos/adf7e20b5f00551a38a8 | ||||||||||||||||||||||||||||||||||||||||||||||||||||
Only fields of interest, keys encoded | 1660 | 288.585, 378.330, 67.400 | 7290 | 143.045, 2246.330, 70.940 | 83000Mb | 8.30 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Denormalized Tweet | 4707 | 603.030, 716.414, 186.300 | 9246 | 350.465, 5802.315, 198.150 | 235350Mb | 23.54 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Denormalized Tweet with only needed fields | 2158 | 255.200, 318.004, 77.040 | 6378 | 142.004, 2165.425, 71.850 | 107900Mb | 10.79 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Denormalized Tweet with only needed fields, keys encoded | 1734 | 242.375, 305.970, 65.689 | 6378 | 133.420, 2165.465, 68.730 | 86700Mb | 8.67 | |||||||||||||||||||||||||||||||||||||||||||||||||||||
Possible Candidate | 1911 | 291.691, 380.090, 74.741 | 7290 | 156.535, 2258.445, 77.696 | 95550Mb | 9.55 |
Source code of test script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
# json_tests.py | |
# | |
# | |
# Created by thanos vassilakis on 12/26/13. | |
# | |
from sys import getsizeof, stderr | |
import requests | |
import json,sys | |
def total_size(o):
    """Return the recursive ``getsizeof`` total for a decoded JSON value.

    The shallow size of ``o`` is always counted.  For a dict the sizes of
    its values are added recursively (keys are not counted); for a list
    the sizes of its elements are added recursively.  Any other value
    contributes only its own shallow size.

    Shared objects are counted once per reference, so the result is an
    upper-bound style estimate rather than an exact memory footprint.
    """
    size = getsizeof(o)
    if isinstance(o, dict):
        size += sum(total_size(v) for v in o.values())
    elif isinstance(o, list):
        # Lists need their own branch: without it nested list payloads
        # are reported at the size of the outer container only.
        size += sum(total_size(v) for v in o)
    return size
def codeKeys(obj, path=''):
    """Replace every dict key with a positional hexadecimal code.

    Each value of a dict is keyed by its position in ``obj.values()``,
    formatted as upper-case hex.  When ``path`` is non-empty the code is
    prefixed with ``path`` and a dot, so nested dicts receive codes such
    as ``"1.0"``.  Only plain dicts are rewritten; lists and scalars are
    returned unchanged, and dicts inside lists are not visited.
    """
    if type(obj) is not dict:
        return obj

    coded = {}
    for position, value in enumerate(obj.values()):
        if path:
            code = "%s.%X" % (path, position)
        else:
            code = "%X" % position
        # The freshly built code doubles as the path for the children.
        coded[code] = codeKeys(value, code)
    return coded
def pathKeys(obj, path=''):
    """Rewrite nested dict keys as dotted paths from the root.

    A key ``k`` found under a non-empty ``path`` becomes ``"path.k"``;
    at the top level (empty ``path``) the key is kept as is.  The
    nesting itself is preserved -- only the key names change.  Values
    that are not plain dicts (including lists) are returned untouched.
    """
    if type(obj) is not dict:
        return obj

    renamed = {}
    for name in obj:
        qualified = "%s.%s" % (path, name) if path else name
        renamed[qualified] = pathKeys(obj[name], qualified)
    return renamed
def denormalize(obj, new_obj=None):
    """Flatten nested dicts into a single-level dict.

    Every non-dict value in ``obj`` is copied under its own key.  Dict
    values are descended into and their entries merged into the same
    result; the key that held the nested dict is dropped.  Lists are
    copied as values and not descended into.  When two entries end up
    with the same key, the one visited later wins.

    Parameters:
        obj: the dict to flatten.
        new_obj: optional dict to collect the flattened entries into.
            It is updated in place; a fresh dict is created when it is
            omitted.

    Returns:
        The dict holding the flattened entries (``new_obj`` itself when
        one was supplied).
    """
    # Compare against None rather than truthiness: an empty dict passed
    # by the caller is a valid accumulator and must not be discarded.
    if new_obj is None:
        new_obj = {}
    for k in obj:
        value = obj[k]
        if isinstance(value, dict):
            # The recursive call writes straight into the accumulator.
            denormalize(value, new_obj)
        else:
            new_obj[k] = value
    return new_obj
# Benchmark fixtures.  Each name below is one payload variant that the
# driver serializes with json.dumps and parses back with json.loads.

# Real empty containers, so the encoded messages are `{}` and `[]`
# rather than JSON strings that merely contain those characters.
empty_obj = {}
empty_list = []

# Sample documents are downloaded from the author's gists at import time.
dict_dict = requests.get("https://gist.github.com/thanos/8153867/raw/dict_dict.json").json()
list_list = requests.get("https://gist.github.com/thanos/8153937/raw/list_list.json").json()
tweet = requests.get("https://gist.github.com/thanos/8153701/raw/gnip_record.json").json()

tweet_id_only = dict(id=tweet['id'])

# Deliberately double-encoded: the whole tweet travels as one opaque
# JSON string next to its id.
tweet_payload = dict(id=tweet['id'], payload=json.dumps(tweet))

# Kept as a dict (like tweet_foi_coded below) so the benchmark measures
# an object with coded keys, not a pre-serialized string.
tweet_coded = codeKeys(tweet)

tweet_with_needed_fields = requests.get("https://gist.github.com/thanos/adf7e20b5f00551a38a8/raw/gnip_record_with_needed_fields.json").json()
tweet_foi_coded = codeKeys(tweet_with_needed_fields)

# Flattened variants: keys are first rewritten as dotted paths, then the
# nested dicts are collapsed into a single level.
denormalized_tweet = denormalize(pathKeys(tweet))
denormalized_tweet_foi = denormalize(pathKeys(tweet_with_needed_fields))
denormalized_tweet_foi_coded = denormalize(pathKeys(tweet_foi_coded))
if __name__ == '__main__':
    import timeit
    import pprint
    import time

    # Each entry: [row title, payload to benchmark, HTML shown as example].
    tests = [
        ["Empty object", empty_obj, "{}"],
        ["Empty list", empty_list, "[]"],
        ["Object of objects", dict_dict, '<script src="https://gist.github.com/thanos/8153867.js"></script>'],
        ["List of lists", list_list, '<script src="https://gist.github.com/thanos/8153937.js"></script>'],
        ["Object with only tweet id", tweet_id_only, ""],
        ["Full tweet message", tweet, '<a href="https://gist.github.com/thanos/adf7e20b5f00551a38a8">https://gist.github.com/thanos/adf7e20b5f00551a38a8</a>'],
        ["Message with string payload", tweet_payload, ''],
        ["Object where the field names are codes", tweet_coded, ''],
        ["Only fields of interest", tweet_with_needed_fields, '<a href="https://gist.github.com/thanos/adf7e20b5f00551a38a8">https://gist.github.com/thanos/adf7e20b5f00551a38a8</a>'],
        ["Only fields of interest, keys encoded", tweet_foi_coded, ''],
        ["Denormalized Tweet", denormalized_tweet, ''],
        ["Denormalized Tweet with only needed fields", denormalized_tweet_foi, ''],
        ["Denormalized Tweet with only needed fields, keys encoded", denormalized_tweet_foi_coded, ''],
    ]

    # One HTML table row is emitted per payload.
    row_template = '<tr bgcolor="white"><td>%s</td><td>%d</td><td>%.3f</td><td>%d</td><td>%.3f</td><td>%dMb</td><td>%.2f</td><td><tt>%s</tt></td></tr>'

    for title, test, example in tests:
        test_msg = json.dumps(test)
        # getsizeof reports the Python string object's size, which
        # includes interpreter overhead on top of the encoded text.
        msg_size = sys.getsizeof(test_msg)

        # Wall-clock time for 100000 decodes of the same message.
        started = time.time()
        for _ in xrange(100000):
            json.loads(test_msg)
        deserialize = time.time() - started

        test_obj = json.loads(test_msg)
        obj_size = total_size(test_obj)

        # Wall-clock time for 100000 encodes of the decoded object.
        started = time.time()
        for _ in xrange(100000):
            json.dumps(test_obj)
        serialize = time.time() - started

        # Projected footprint for 5000000 messages.
        storage = msg_size * 5000000
        # NOTE(review): the 1e-10 cost-per-byte factor is not explained
        # anywhere in the script -- confirm the pricing basis.
        storage_cost = storage * 1e-10

        fields = (title, msg_size, deserialize, obj_size, serialize, storage/(10**6), storage_cost, example)
        sys.stdout.write(row_template % fields + '\n')
Empty object
{}
Empty list
[]
Object of objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"id": "tag:search.twitter.com,2005:403224522679009280", | |
"actor": { | |
"preferredUsername": "LandenEhlers", | |
"displayName": "Landen Ehlers", | |
"followersCount": 15, | |
"twitterTimeZone": null, | |
"image": "https://pbs.twimg.com/profile_images/378800000646150423/83090ccb95a60def923c674e7bd002a0_normal.jpeg", | |
"verified": false, | |
"statusesCount": 24, | |
"summary": "Senior Construction Science student at Texas A&M University. Barefoot Waterskiing National Champion. Vice President of the Texas", | |
"utcOffset": null, | |
"link": "http://www.twitter.com/LandenEhlers", | |
"location": { | |
"displayName": "College Station, TX", | |
"objectType": "place" | |
}, | |
"favoritesCount": 2, | |
"friendsCount": 65, | |
"listedCount": 1, | |
"postedTime": "2013-09-11T23:52:03.000Z", | |
"id": "id:twitter.com:1855784545", | |
"objectType": "person" | |
}, | |
"objectType": "activity" | |
} |
List of lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
"tag:search.twitter.com,2005:403224522679009280", | |
"activity", | |
[ | |
"person", | |
"id:twitter.com:1855784545", | |
"http:\\/\\/www.twitter.com\\/LandenEhlers", | |
"Landen Ehlers", | |
"2013-09-11T23:52:03.000Z", | |
"https:\\/\\/pbs.twimg.com\\/profile_images\\/378800000646150423\\/83090ccb95a60def923c674e7bd002a0_normal.jpeg", | |
"Senior Construction Science student at Texas A&M University. Barefoot Waterskiing National Champion. Vice President of the Texas", | |
65, | |
15, | |
1, | |
24, | |
null, | |
false, | |
null, | |
"LandenEhlers", | |
[ | |
"place", | |
"College Station, TX" | |
], | |
2 | |
] | |
] |
Object with only tweet id
Full tweet message
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"body": "Enjoyed our half price chicken and wawfuls today! @tamusportclubs @SullysGrill @TAMUWaterski #SCPartnerday http://t.co/XRsVqYy9Zo", | |
"retweetCount": 0, | |
"generator": { | |
"link": "http://twitter.com/download/android", | |
"displayName": "Twitter for Android" | |
}, | |
"twitter_filter_level": "medium", | |
"geo": { | |
"type": "Point", | |
"coordinates": [ | |
30.622496, | |
-96.3283527 | |
] | |
}, | |
"favoritesCount": 0, | |
"object": { | |
"postedTime": "2013-11-20T18:13:12.000Z", | |
"summary": "Enjoyed our half price chicken and wawfuls today! @tamusportclubs @SullysGrill @TAMUWaterski #SCPartnerday http://t.co/XRsVqYy9Zo", | |
"link": "http://twitter.com/LandenEhlers/statuses/403224522679009280", | |
"id": "object:search.twitter.com,2005:403224522679009280", | |
"objectType": "note" | |
}, | |
"actor": { | |
"preferredUsername": "LandenEhlers", | |
"displayName": "Landen Ehlers", | |
"links": [ | |
{ | |
"href": null, | |
"rel": "me" | |
} | |
], | |
"twitterTimeZone": null, | |
"image": "https://pbs.twimg.com/profile_images/378800000646150423/83090ccb95a60def923c674e7bd002a0_normal.jpeg", | |
"verified": false, | |
"location": { | |
"displayName": "College Station, TX", | |
"objectType": "place" | |
}, | |
"statusesCount": 24, | |
"summary": "Senior Construction Science student at Texas A&M University. Barefoot Waterskiing National Champion. Vice President of the Texas A&M Waterski Team.", | |
"languages": [ | |
"en" | |
], | |
"utcOffset": null, | |
"link": "http://www.twitter.com/LandenEhlers", | |
"followersCount": 15, | |
"favoritesCount": 2, | |
"friendsCount": 65, | |
"listedCount": 1, | |
"postedTime": "2013-09-11T23:52:03.000Z", | |
"id": "id:twitter.com:1855784545", | |
"objectType": "person" | |
}, | |
"twitter_lang": "en", | |
"twitter_entities": { | |
"symbols": [], | |
"user_mentions": [ | |
{ | |
"id": 338528272, | |
"indices": [ | |
50, | |
65 | |
], | |
"id_str": "338528272", | |
"screen_name": "tamusportclubs", | |
"name": "TAMU Sport Clubs" | |
}, | |
{ | |
"id": 325152462, | |
"indices": [ | |
66, | |
78 | |
], | |
"id_str": "325152462", | |
"screen_name": "SullysGrill", | |
"name": "Sully's Sports Grill" | |
}, | |
{ | |
"id": 432729855, | |
"indices": [ | |
79, | |
92 | |
], | |
"id_str": "432729855", | |
"screen_name": "TAMUWaterski", | |
"name": "TAMU Waterski" | |
} | |
], | |
"hashtags": [ | |
{ | |
"indices": [ | |
93, | |
106 | |
], | |
"text": "SCPartnerday" | |
} | |
], | |
"urls": [], | |
"media": [ | |
{ | |
"expanded_url": "http://twitter.com/LandenEhlers/status/403224522679009280/photo/1", | |
"display_url": "pic.twitter.com/XRsVqYy9Zo", | |
"url": "http://t.co/XRsVqYy9Zo", | |
"media_url_https": "https://pbs.twimg.com/media/BZiKkRdCEAAFC-j.jpg", | |
"id_str": "403224522414755840", | |
"sizes": { | |
"small": { | |
"h": 192, | |
"resize": "fit", | |
"w": 340 | |
}, | |
"large": { | |
"h": 579, | |
"resize": "fit", | |
"w": 1023 | |
}, | |
"medium": { | |
"h": 339, | |
"resize": "fit", | |
"w": 600 | |
}, | |
"thumb": { | |
"h": 150, | |
"resize": "crop", | |
"w": 150 | |
} | |
}, | |
"indices": [ | |
107, | |
129 | |
], | |
"type": "photo", | |
"id": 4.0322452241476e+17, | |
"media_url": "http://pbs.twimg.com/media/BZiKkRdCEAAFC-j.jpg" | |
} | |
] | |
}, | |
"verb": "post", | |
"link": "http://twitter.com/LandenEhlers/statuses/403224522679009280", | |
"location": { | |
"displayName": "College Station, TX", | |
"name": "College Station", | |
"link": "https://api.twitter.com/1.1/geo/id/85128f80a57c03ad.json", | |
"twitter_country_code": "US", | |
"country_code": "United States", | |
"geo": { | |
"type": "Polygon", | |
"coordinates": [ | |
[ | |
[ | |
-96.386719, | |
30.534473 | |
], | |
[ | |
-96.386719, | |
30.658246 | |
], | |
[ | |
-96.204688, | |
30.658246 | |
], | |
[ | |
-96.204688, | |
30.534473 | |
] | |
] | |
] | |
}, | |
"objectType": "place" | |
}, | |
"provider": { | |
"link": "http://www.twitter.com", | |
"displayName": "Twitter", | |
"objectType": "service" | |
}, | |
"postedTime": "2013-11-20T18:13:12.000Z", | |
"id": "tag:search.twitter.com,2005:403224522679009280", | |
"gnip": { | |
"matching_rules": [ | |
{ | |
"tag": null, | |
"value": "has:geo has:mentions has:links has:hashtags has:profile_geo" | |
} | |
], | |
"profileLocations": [ | |
{ | |
"displayName": "College Station, Texas, United States", | |
"address": { | |
"country": "United States", | |
"region": "Texas", | |
"subRegion": "Brazos County", | |
"countryCode": "US", | |
"locality": "College Station" | |
}, | |
"geo": { | |
"type": "point", | |
"coordinates": [ | |
-96.33441, | |
30.62798 | |
] | |
}, | |
"objectType": "place" | |
} | |
], | |
"language": { | |
"value": "en" | |
}, | |
"klout_score": 21, | |
"urls": [ | |
{ | |
"url": "http://t.co/XRsVqYy9Zo", | |
"expanded_status": 200, | |
"expanded_url": "http://twitter.com/LandenEhlers/status/403224522679009280/photo/1" | |
} | |
], | |
"klout_profile": { | |
"link": "http://klout.com/user/id/289637767579957066", | |
"topics": [ | |
{ | |
"link": "http://klout.com/topic/id/10000000000000010000", | |
"displayName": "Ricky Carmichael", | |
"klout_topic_id": "10000000000000010000" | |
}, | |
{ | |
"link": "http://klout.com/topic/id/1297", | |
"displayName": "Rock Music", | |
"klout_topic_id": "1297" | |
} | |
], | |
"klout_user_id": "289637767579957066" | |
} | |
}, | |
"objectType": "activity" | |
} |
Message with string payload
Object where the field names are codes
Only fields of interest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"body": "Enjoyed our half price chicken and wawfuls today! @tamusportclubs @SullysGrill @TAMUWaterski #SCPartnerday http://t.co/XRsVqYy9Zo", | |
"retweetCount": 0, | |
"generator": "Twitter for Android", | |
"geo": [ | |
30.622496, | |
-96.3283527 | |
], | |
"favoritesCount": 0, | |
"actor": { | |
"preferredUsername": "LandenEhlers", | |
"friendsCount": 65, | |
"followersCount": 15, | |
"image": "378800000646150423/83090ccb95a60def923c674e7bd002a0_normal.jpeg", | |
"verified": false, | |
"statusesCount": 24, | |
"summary": "Senior Construction Science student at Texas A&M University. Barefoot Waterskiing National Champion. Vice President of the Texas A&M Waterski Team.", | |
"languages": [ | |
"en" | |
], | |
"location": { | |
"displayName": "College Station, TX" | |
}, | |
"favoritesCount": 2, | |
"displayName": "Landen Ehlers", | |
"listedCount": 1, | |
"id": "1855784545" | |
}, | |
"twitter_lang": "en", | |
"twitter_entities": { | |
"user_mentions": [ | |
{ | |
"id": 338528272, | |
"screen_name": "tamusportclubs", | |
"name": "TAMU Sport Clubs" | |
}, | |
{ | |
"id": 325152462, | |
"screen_name": "SullysGrill", | |
"name": "Sully's Sports Grill" | |
}, | |
{ | |
"id": 432729855, | |
"screen_name": "TAMUWaterski", | |
"name": "TAMU Waterski" | |
} | |
], | |
"hashtags": [ | |
"SCPartnerday" | |
] | |
}, | |
"provider": "Twitter", | |
"postedTime": "2013-11-20T18:13:12.000Z", | |
"id": "403224522679009280", | |
"gnip": { | |
"klout_profile": { | |
"topics": [ | |
{ | |
"displayName": "Ricky Carmichael", | |
"klout_topic_id": "10000000000000010000" | |
}, | |
{ | |
"displayName": "Rock Music", | |
"klout_topic_id": "1297" | |
} | |
], | |
"klout_user_id": "289637767579957066", | |
"address": { | |
"displayName": "College Station, Texas, United States", | |
"countryCode": "US", | |
"locality": "College Station", | |
"country": "United States", | |
"region": "Texas", | |
"subRegion": "Brazos County" | |
} | |
}, | |
"klout_score": 21, | |
"matching_rules": [ | |
{ | |
"tag": null, | |
"value": "has:geo has:mentions has:links has:hashtags has:profile_geo" | |
} | |
], | |
"urls": [ | |
{ | |
"url": "http://t.co/XRsVqYy9Zo", | |
"expanded_url": "http://twitter.com/LandenEhlers/status/403224522679009280/photo/1" | |
} | |
] | |
} | |
} |