辅导Python getHTMLText 、讲解BeautifulSoup python

- 首页 >> Python编程

# -*- coding: utf-8 -*-
"""Download a news article, save its text, and report its most frequent words."""
from collections import Counter

import jieba
import requests
from bs4 import BeautifulSoup

# getContent() writes the article file and fenci() reads it back, so both
# sides must agree on the path and on the encoding.
NEWS_TEXT_PATH = "news_text.txt"
RESULT_PATH = "result.txt"
FILE_ENCODING = "GBK"


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Returns:
        The decoded body, or an empty string when the request fails
        (connection error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Best effort: the caller treats "" as "nothing downloaded".
        return ""


def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or ""."""
    matches = soup.select(selector)
    return matches[0].get_text() if matches else ""


def getContent(url):
    """Extract the article at *url*, print it, and save it to NEWS_TEXT_PATH.

    The title, publish time, author line and body paragraphs are located
    with CSS selectors. A field whose selector matches nothing is treated
    as an empty string, so a failed download or a changed page layout
    produces empty output instead of an IndexError.

    The file is written as: title, time, each paragraph followed by a blank
    line, then the author line.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")

    title = _first_text(soup, "div.hd > h1")
    # Named time_text so the stdlib module name `time` is not shadowed.
    time_text = _first_text(soup, "div.a_Info > span.a_time")
    author = _first_text(
        soup, "div.qq_articleFt > div.qq_toolWrap > div.qq_editor"
    )
    # A bs4 Tag is always truthy, so the child count is checked explicitly
    # to skip paragraphs that have no contents at all.
    paras = [
        para.get_text()
        for para in soup.select("div.Cnt-Main-Article-QQ > p.text")
        if len(para) > 0
    ]

    print(title)
    print(time_text)
    print(author)
    for para in paras:
        print(para)
        print()

    # errors="replace" keeps characters that GBK cannot represent (for
    # example a non-breaking space) from aborting the whole write.
    with open(
        NEWS_TEXT_PATH, "w", encoding=FILE_ENCODING, errors="replace"
    ) as fo:
        fo.write(title + "\n")
        fo.write(time_text + "\n")
        for para in paras:
            fo.write(para + "\n\n")
        fo.write(author + "\n")


def fenci(top_n=15):
    """Segment the saved article with jieba and report the most common words.

    Reads NEWS_TEXT_PATH, counts every token except single-character ones,
    then prints the *top_n* most frequent words and writes them to
    RESULT_PATH as ``word:count`` lines. If the text holds fewer than
    *top_n* distinct words, only those are reported.

    Args:
        top_n: Maximum number of words to report (default 15).
    """
    with open(NEWS_TEXT_PATH, "r", encoding=FILE_ENCODING) as f:
        txt = f.read()

    # Single-character tokens are skipped, as in the original counting rule.
    counts = Counter(word for word in jieba.lcut(txt) if len(word) != 1)

    with open(RESULT_PATH, "w", encoding=FILE_ENCODING) as lk:
        for word, count in counts.most_common(top_n):
            lk.write("{}:{}\n".format(word, count))
            print("{:<10}{:>5}".format(word, count))


def main():
    """Scrape the sample article and print its word-frequency summary."""
    url = "https://news.qq.com/a/20180515/023424.htm"
    getContent(url)
    fenci()


if __name__ == "__main__":
    main()

站长地图