some important concepts
– using codecs module to write unicode files
– using authentication with web APIs
– using offset when accessing web APIs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | #!/usr/bin/env python # -*- coding: utf-8 -*- import json import codecs import requests URL_POPULAR = URL_MAIN + "mostpopular/v2/" API_KEY = { "popular" : "", "article" : ""} def get_from_file(kind, period): filename = "popular-{0}-{1}.json" . format (kind, period) with open (filename, "r" ) as f: return json.loads(f.read()) def article_overview(kind, period): data = get_from_file(kind, period) titles = [] urls = [] for article in data: section = article[ "section" ] title = article[ "title" ] titles.append({section: title}) if "media" in article: for m in article[ "media" ]: for mm in m[ "media-metadata" ]: if mm[ "format" ] = = "Standard Thumbnail" : urls.append(mm[ "url" ]) return (titles, urls) def query_site(url, target, offset): if API_KEY[ "popular" ] = = " " or API_KEY[" article "] == " ": print "You need to register for NYTimes Developer account to run this program." print "See Instructor notes for information" return False params = { "api-key" : API_KEY[target], "offset" : offset} r = requests.get(url, params = params) if r.status_code = = requests.codes.ok: return r.json() else : r.raise_for_status() def get_popular(url, kind, days, section = "all-sections" , offset = 0 ): if days not in [ 1 , 7 , 30 ]: print "time period can be 1, 7, 30 days only" return False if kind not in [ "viewd" , "shared" , "emailed" ]: print "kind can be only one of viewd/shared/emailed" return False url + = "most{0}/{1}/{2}.json" . format (kind, section, days) data = query_site(url, "popular" , offset) return data def save_file(kind, period): data = get_popular(URL_POPULAR, "viewd" , 1 ) num_results = data[ "num_results" ] full_data = [] with codecs. open ( "popular-{0}-{1}.json" . format (kind, period), encoding = 'utf-8' , mode = 'w' ) as v: for offset in range ( 0 , num_results, 20 ): data = get_popular(URL_POPULAR, kind, period, offset = offset) full_data + = data[ "results" ] v.write(json.dumps(full_data, indent = 2 )) def test(): titles, urls = article_overview( "viewd" , 1 ) assert len (titles) = = 20 assert len (urls) = = 30 assert titles[ 2 ] = = { 'Opinion' : 'Professors, Wee need you!' } assert urls[ 20 ] = = 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg' if __name__ = = "__main__" : test |
Python XML
https://wiki.python.org/moin/PythonXml