Wrangling JSON

Some important concepts:
– using the codecs module to write Unicode files
– using authentication with web APIs
– using an offset when accessing web APIs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import json
import codecs
import requests
 
# Root of the NYTimes web services; every endpoint URL is built from it.
URL_MAIN = "https://api.nytimes.com/svc/"
# Base URL of the "Most Popular" API (version 2).
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
# API keys, looked up by target name. They are intentionally left blank here:
# fill in your own keys before running, since query_site() refuses to send a
# request while either value is empty.
API_KEY = { "popular": "",
            "article": ""}
 
def get_from_file(kind, period):
    """Load a previously saved popular-articles JSON file.

    The file name is ``popular-<kind>-<period>.json`` and is opened relative
    to the current working directory. Returns the decoded JSON content.
    """
    path = "popular-{0}-{1}.json".format(kind, period)
    with open(path) as handle:
        return json.load(handle)
 
def article_overview(kind, period):
    """Summarise the saved popular-articles file for *kind* and *period*.

    Returns a ``(titles, urls)`` tuple. ``titles`` holds one
    ``{section: title}`` dict per article, in file order. ``urls`` holds the
    ``url`` of every media-metadata entry whose ``format`` is
    ``"Standard Thumbnail"``.
    """
    section_titles = []
    thumbnail_urls = []

    for entry in get_from_file(kind, period):
        section, headline = entry["section"], entry["title"]
        section_titles.append({section: headline})

        # Articles without any media contribute a title only.
        if "media" not in entry:
            continue
        for media in entry["media"]:
            thumbnail_urls.extend(
                meta["url"]
                for meta in media["media-metadata"]
                if meta["format"] == "Standard Thumbnail"
            )
    return (section_titles, thumbnail_urls)
 
def query_site(url, target, offset):
    """Send a GET request to *url* and return the decoded JSON response.

    Args:
        url: Full endpoint URL to query.
        target: Key into ``API_KEY`` selecting which API key is sent.
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The parsed JSON body when the response status is OK, or ``False``
        (after printing a hint) when either entry of ``API_KEY`` is still
        empty. A non-OK status that ``raise_for_status()`` does not treat as
        an error falls through and yields ``None``.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` for error
            status codes.
    """
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        # Single-argument parenthesised print emits the same text on
        # Python 2 and Python 3.
        print("You need to register for NYTimes Developer account to run this program.")
        print("See Instructor notes for information")
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()
 
def get_popular(url, kind, days, section="all-sections", offset=0):
    """Build a Most Popular query URL and fetch one page of results.

    Args:
        url: Base URL the endpoint path is appended to.
        kind: One of ``"viewed"``, ``"shared"`` or ``"emailed"``. The
            misspelling ``"viewd"`` is accepted as an alias for ``"viewed"``.
        days: Time period in days; must be 1, 7 or 30.
        section: Section name placed in the URL path.
        offset: Passed through to ``query_site`` as the result offset.

    Returns:
        Whatever ``query_site`` returns for the constructed URL, or ``False``
        (after printing a message) when *days* or *kind* is invalid.
    """
    if days not in (1, 7, 30):
        print("time period can be 1, 7, 30 days only")
        return False

    # "viewd" is a legacy misspelling used by older callers and saved file
    # names. The endpoint path needs the correctly spelled "mostviewed", so
    # normalise it before building the URL.
    if kind == "viewd":
        kind = "viewed"
    if kind not in ("viewed", "shared", "emailed"):
        print("kind can be only one of viewed/shared/emailed")
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    return query_site(url, "popular", offset)
 
def save_file(kind, period):
    """Download every result page for *kind*/*period* and save it as JSON.

    The first request is used to read ``num_results``; the pages are then
    fetched at offsets 0, 20, 40, ... and their ``results`` lists are
    concatenated. The combined list is written, UTF-8 encoded, to
    ``popular-<kind>-<period>.json`` in the current working directory.

    Nothing is written when the first request yields no data (for example
    invalid arguments or missing API keys).
    """
    # Ask for the same kind/period that will be downloaded, so the result
    # count matches the pages fetched below.
    first_page = get_popular(URL_POPULAR, kind, period)
    if not first_page:
        return
    num_results = first_page["num_results"]

    full_data = []
    # Step of 20 between offsets -- presumably the API page size; confirm
    # against the API documentation.
    for offset in range(0, num_results, 20):
        page = get_popular(URL_POPULAR, kind, period, offset=offset)
        full_data += page["results"]

    # Open the file only after all pages were fetched, so a failed download
    # does not leave behind an empty or truncated file.
    filename = "popular-{0}-{1}.json".format(kind, period)
    with codecs.open(filename, encoding='utf-8', mode='w') as v:
        v.write(json.dumps(full_data, indent=2))
 
def test():
    """Check article_overview() against the saved one-day "viewd" file.

    The expected counts and title are tied to one specific saved data file
    (``popular-viewd-1.json``); they will not hold for freshly downloaded
    data.
    """
    titles, urls = article_overview("viewd", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    # NOTE(review): the headline was previously spelled 'Professors, Wee need
    # you!', which looks like a typo -- confirm against the data file.
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
 
if __name__ == "__main__":
    # Call the function; a bare `test` only names it and runs nothing.
    test()

Python XML
https://wiki.python.org/moin/PythonXml