しょっぱな躓いた。
from bs4 import BeautifulSoup
from urllib.request import
from urllib.parse import
from os import makedirs
import os.path, time, re
proc_files = {}
def enum_links(html, base):
    """Collect absolute URLs for the stylesheet and anchor links in a page.

    Args:
        html: HTML markup of the page, as text.
        base: URL the page was fetched from; relative hrefs are resolved
            against it.

    Returns:
        A list of absolute URLs: stylesheet links first, then anchors.
        Duplicates are not removed.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")
    links = soup.select("link[rel='stylesheet']")
    links += soup.select("a[href]")
    result = []
    for a in links:
        # The stylesheet selector does not require an href attribute, so
        # indexing attrs directly could raise KeyError; skip such tags.
        href = a.attrs.get("href")
        if href is None:
            continue
        result.append(urljoin(base, href))
    return result
def download_file(url):
    """Download *url* into a local mirror tree and return the saved path.

    The local path is "./<host><path>"; a URL whose path ends in "/" is
    stored as "index.html" inside that directory. If the local path already
    exists, nothing is fetched and the path is returned as-is.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The local path of the file, or None when the download failed.
    """
    # Local imports keep this function self-contained.
    from urllib.parse import urlparse
    from urllib.request import urlretrieve

    o = urlparse(url)
    savepath = "./" + o.netloc + o.path
    if savepath.endswith("/"):
        savepath += "index.html"
    savedir = os.path.dirname(savepath)
    if os.path.exists(savepath):
        return savepath
    try:
        # Directory creation is inside the try so that a path collision
        # (e.g. a file already saved where a directory is needed) is
        # reported as a failed download instead of aborting the crawl.
        if not os.path.exists(savedir):
            print("mkdir=", savedir)
            makedirs(savedir)
        print("download=", url)
        urlretrieve(url, savepath)
        # Pause after each real download to avoid hammering the server.
        time.sleep(1)
        return savepath
    except Exception as err:
        # Best-effort crawl: report the cause and keep going. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("ダウンロード失敗:", url, err)
        return None
def analize_html(url, root_url):
    """Download an HTML page and recursively mirror what it links to.

    The page is saved via download_file(), its links are extracted with
    enum_links(), and each link is either analyzed recursively (HTML pages)
    or simply downloaded (everything else). Pages already analyzed are
    tracked in the module-level ``proc_files`` dict and skipped.

    Args:
        url: URL of the HTML page to process.
        root_url: URL prefix that bounds the crawl. Links that do not start
            with it are ignored unless they point at a ``.css`` file.

    Returns:
        None.
    """
    savepath = download_file(url)
    if savepath is None:
        return
    if savepath in proc_files:
        return
    proc_files[savepath] = True
    print("analize_html=", url)
    # Use a context manager so the file handle is closed before recursing.
    with open(savepath, "r", encoding="utf-8") as f:
        html = f.read()
    links = enum_links(html, url)
    for link_url in links:
        # Stay inside the root, but still fetch stylesheets hosted elsewhere.
        if not link_url.startswith(root_url):
            if not re.search(r"\.css$", link_url):
                continue
        # HTML pages (.html or .htm) are followed recursively.
        if re.search(r"\.(html|htm)$", link_url):
            analize_html(link_url, root_url)
            continue
        # Any other resource is downloaded without being parsed.
        download_file(link_url)
if __name__ == "__main__":
    # Mirror the library reference. The trailing slash matters: without it
    # relative links would resolve against ".../3.5/" instead of
    # ".../3.5/library/", and the page would be saved as a file named
    # "library" where a directory of that name is needed for sub-pages.
    url = "http://docs.python.jp/3.5/library/"
    analize_html(url, url)
はい?
[vagrant@localhost python]$ python3 app.py
File "app.py", line 2
from urllib.request import
^
SyntaxError: invalid syntax
≧urllib.request モジュールは基本的な認証、暗号化認証、リダイレクション、Cookie、その他の介在する複雑なアクセス環境において (大抵は HTTP で) URL を開くための関数とクラスを定義します。
`from urllib.request import` という書き方がおかしい気がする。`import` の後ろに読み込む名前(たとえば `urlretrieve`)が抜けているので、SyntaxError になっているようだ。