1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
|
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree
def _clean_chapter_markup(content):
    """Convert the chapter's content element into plain text.

    The element is stringified and a fixed, ordered sequence of literal
    substitutions strips the site's markup: the wrapping ``<div>`` tags, the
    "report an error" banner, ``<br/>`` tags and spaces. Paired ``<br/>`` tags
    and the "(end of chapter)" marker become newlines.

    Args:
        content: The element returned by ``soup.find('div', id="content")``.

    Returns:
        The cleaned chapter text as a ``str``.
    """
    return (
        str(content)
        .replace('\n', '')
        # Paired line breaks separate paragraphs; must run before the lone
        # '<br/>' removal below or the paragraph breaks would be lost.
        .replace('<br/> <br/> ', '\n')
        .replace(
            '<div class="posterror"><a class="red" href="javascript:postError();">章节错误,点此举报(免注册)</a>,'
            '举报后维护人员会在两分钟内校正章节内容,请耐心等待,并刷新页面。</div>',
            '')
        .replace('<br/>', '')
        .replace('<div class="content" ' 'id="content">', '')
        .replace('</div>', '')
        # NOTE(review): the next two patterns look identical here; one of them
        # was presumably a non-breaking or full-width space - confirm.
        .replace(' ', '')
        .replace(' ', '')
        .replace('(本章完)', '\n')
    )


def get_html(start_url='http://www.ywggzy.com/bxwx/28463/4083918.html',
             output_path='./老敖小说.txt', max_chapters=10000):
    """Scrape a web novel chapter by chapter and append it to a text file.

    Written for the novel "Shocking! My Cultivation Secret Was Exposed by My
    Granddaughter's Livestream". Starting at ``start_url``, each page is
    downloaded, its ``<div id="content">`` is cleaned and appended to
    ``output_path``, and the "next" link found by a fixed XPath is followed.
    Crawling stops when a page has no content/title element, when no next link
    is found, when a request fails, or after ``max_chapters`` pages.

    Args:
        start_url: URL of the first chapter page to download.
        output_path: File the chapter text is appended to (UTF-8).
        max_chapters: Upper bound on the number of pages fetched.

    Returns:
        None. Progress and the stop reason are printed to stdout.
    """
    finished_message = '已爬取完毕!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    url = start_url
    with open(output_path, 'a', encoding='utf-8') as fp:
        for _ in range(max_chapters):
            try:
                # Without a timeout requests can block forever on a stalled
                # server.
                req = requests.get(url, timeout=30)
                req.raise_for_status()
            except requests.RequestException as err:
                # A network/HTTP failure is not the same as reaching the end
                # of the book, so report it separately.
                print('请求失败,已停止:', url, err)
                return
            req.encoding = 'utf-8'
            html = req.text
            soup = BeautifulSoup(html, "html.parser")

            content = soup.find('div', id="content")
            title = soup.find('h1', class_="title")
            if content is None or title is None:
                # Not a chapter page: treat as the end of the book instead of
                # writing the text "None" into the output file.
                print(finished_message)
                return
            name = title.text

            ct = _clean_chapter_markup(content)
            print('ct=========', ct)
            print('\n章节:', name)
            fp.write(ct)

            next_hrefs = etree.HTML(html).xpath(
                '//*[@id="container"]/div/div/div[2]/div[4]/a[3]/@href')
            if not next_hrefs:
                print(finished_message)
                return
            # Resolve the href against the current page so both relative and
            # absolute links yield a valid URL.
            url = urljoin(url, next_hrefs[0])
            print('222222222222', url)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_html()
|