Some important concepts:
– using the codecs module to write Unicode files
– using authentication with web APIs
– using an offset when accessing web APIs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download, cache and summarise NYTimes "most popular" article lists.

The script demonstrates three things: writing UTF-8 files through the
``codecs`` module, authenticating against a web API with an API key, and
paging through a result set with an ``offset`` parameter.

The code is written so that it runs unchanged on Python 2 and Python 3.
"""
import codecs
import json

try:
    import requests
except ImportError:
    # Only query_site() talks to the network.  The offline helpers
    # (get_from_file, article_overview) stay usable without requests.
    requests = None

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = {"popular": "",
           "article": ""}

# Step between consecutive ``offset`` values when paging through results.
PAGE_SIZE = 20


def _cache_filename(kind, period):
    """Return the name of the local JSON cache file for ``kind``/``period``."""
    return "popular-{0}-{1}.json".format(kind, period)


def get_from_file(kind, period):
    """Load the article list previously written by ``save_file``.

    Args:
        kind: popularity kind used in the file name (e.g. "shared").
        period: time period in days used in the file name.

    Returns:
        The decoded JSON content of ``popular-<kind>-<period>.json``.

    Raises:
        IOError/OSError: if the cache file does not exist.
        ValueError: if the file does not contain valid JSON.
    """
    # Read with the same codec save_file() writes with.
    with codecs.open(_cache_filename(kind, period),
                     encoding="utf-8", mode="r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    """Summarise the cached article list.

    Args:
        kind: popularity kind of the cached list.
        period: time period in days of the cached list.

    Returns:
        A ``(titles, urls)`` tuple.  ``titles`` holds one
        ``{section: title}`` dict per article; ``urls`` holds the URL of
        every media-metadata entry whose format is "Standard Thumbnail".
    """
    data = get_from_file(kind, period)
    titles = []
    urls = []
    for article in data:
        titles.append({article["section"]: article["title"]})
        # "media" may be absent or empty; treat both as "no media".
        for media in article.get("media") or ():
            for meta in media["media-metadata"]:
                if meta["format"] == "Standard Thumbnail":
                    urls.append(meta["url"])
    return (titles, urls)


def query_site(url, target, offset):
    """Send one authenticated GET request to the API.

    Args:
        url: full endpoint URL.
        target: key into ``API_KEY`` selecting which API key to send.
        offset: value of the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the response status is 200, or
        ``False`` when the API keys have not been filled in.

    Raises:
        ImportError: if the ``requests`` package is not installed.
        requests.HTTPError: for 4xx/5xx responses.
    """
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print("You need to register for NYTimes Developer account to run this program.")
        print("See Instructor notes for information")
        return False
    if requests is None:
        raise ImportError(
            "the 'requests' package is required to query the NYTimes API")
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    """Fetch one page of the "most popular" list.

    Args:
        url: base URL of the most-popular API (see ``URL_POPULAR``).
        kind: one of "viewd", "shared" or "emailed".
        days: time period; must be 1, 7 or 30.
        section: section filter, "all-sections" by default.
        offset: offset of the first result to return.

    Returns:
        The decoded JSON response, or ``False`` when an argument is
        invalid or the request could not be made.
    """
    if days not in (1, 7, 30):
        print("time period can be 1, 7, 30 days only")
        return False
    # NOTE(review): the "viewd" spelling is kept because cache file names
    # and callers in this file use it; the public NYT endpoint appears to
    # be spelled "mostviewed" -- confirm against the API documentation.
    if kind not in ("viewd", "shared", "emailed"):
        print("kind can be only one of viewd/shared/emailed")
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)

    return data


def save_file(kind, period):
    """Download the complete list for ``kind``/``period`` and cache it.

    All pages are fetched first and the file is written only once
    everything has arrived, so a failed download never leaves a
    truncated cache file behind.

    Args:
        kind: popularity kind, as accepted by ``get_popular``.
        period: time period in days, as accepted by ``get_popular``.

    Returns:
        ``False`` if any page could not be fetched (nothing is written in
        that case); ``None`` after the file has been written.
    """
    # Probe with the requested kind/period -- not a fixed one -- so that
    # num_results describes the list that is actually being saved.
    first_page = get_popular(URL_POPULAR, kind, period)
    if not first_page:
        return False
    num_results = first_page["num_results"]
    # The probe already is the page at offset 0; reuse it rather than
    # requesting it a second time.
    full_data = list(first_page["results"])
    for offset in range(PAGE_SIZE, num_results, PAGE_SIZE):
        page = get_popular(URL_POPULAR, kind, period, offset=offset)
        if not page:
            return False
        full_data += page["results"]

    with codecs.open(_cache_filename(kind, period),
                     encoding='utf-8', mode='w') as v:
        v.write(json.dumps(full_data, indent=2))


def test():
    """Self-check against the locally cached ``popular-viewd-1.json``."""
    titles, urls = article_overview("viewd", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, Wee need you!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'


# test() is this script's own self-check and needs a local data file; keep
# pytest from collecting it when the module is imported by a test suite.
test.__test__ = False


if __name__ == "__main__":
    test()
Python XML
https://wiki.python.org/moin/PythonXml