def main():
    """Crawl pages starting at ``DOWNLOAD_URL`` and save every item to "qiushis".

    Each page is fetched with ``download_page`` and handed to ``parse_html``,
    which yields a pair: the items found on that page and the URL to fetch
    next. The loop stops as soon as that URL is falsy. Every item is
    stripped, has each "& " replaced by a newline, and is written to the
    UTF-8 encoded output file followed by a newline.
    """
    url = DOWNLOAD_URL
    with codecs.open("qiushis", "wb", encoding="utf-8") as fp:
        while url:
            # ``page`` rather than ``html`` so the name cannot be confused
            # with (or shadow) an ``html`` module imported elsewhere.
            page = download_page(url)
            # presumably ``url`` is the next-page link; a falsy value ends the crawl
            qiushis, url = parse_html(page)
            for item in qiushis:
                # The newline escapes must stay inside the literals: a raw
                # line break in a quoted string is a syntax error.
                fp.write(item.strip().replace("& ", "\n") + "\n")
# Run the crawler only when this file is executed as a script, so importing
# the module does not start any downloads through main().
if __name__ == "__main__":
    main()
import requests
from lxml import html
# Scrape the entries linked from the site's "/text/" listing page and save
# the text of each entry to "duanzi.txt".
URL = "https://www.qiushibaike.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
}

# NOTE(review): none of the requests below set a timeout or check the HTTP
# status, so a stalled or failed request is not detected here.
r = requests.get(URL + "/text/", headers=headers)
tree = html.fromstring(r.text)

# The Python literal is single-quoted so the XPath expression can keep its
# double-quoted attribute value; the class name is spelled as in the original.
text_eles = tree.xpath('//a[@class = "contentHerf"]')
urls_end = [text_ele.attrib["href"] for text_ele in text_eles]

# The collected hrefs are appended to URL, i.e. treated as site-relative
# paths; one request is made per entry.
text_urls = [
    requests.get(URL + text_url, headers=headers) for text_url in urls_end
]
trees_next = [html.fromstring(text_url.text) for text_url in text_urls]

# Each XPath query returns a list of text nodes; join them into one string
# per entry.
texts = [
    tree_next.xpath('//div[@class = "content"]/text()')
    for tree_next in trees_next
]
text = ["".join(textstr) for textstr in texts]

file = "duanzi.txt"
with open(file, "w+", encoding="utf-8") as f:
    for textstr in text:
        # Skip empty entries. The file must not be closed by hand here:
        # the ``with`` block closes it, and closing it mid-loop would make
        # every later write fail with "I/O operation on closed file".
        if textstr:
            f.write(textstr)