# 一、通过输入的网址获取网页文本内容 (fetch a web page's text content from a URL)
# url2txt.py
# 2019/05/12
# coding: UTF-8
# 告诉解释器该PY程序是utf-8编码的,源程序中可以有中文
import csv # 将数据写入到csv文件中
import http.client # 异常处理
import os # 跨平台拼接文件路径
import random # 取随机数
import re
import socket # 异常处理
import sys
import time # 时间相关操作

import requests # 抓取网页中html源代码
# import urllib.request # 另一种抓取网页html源代码的方法,但没有requests方便
from bs4 import BeautifulSoup # 代替正则式取源码中相应标签中的内容
class Tool:
    """Strip an HTML fragment down to plain text with a fixed pipeline of
    regex substitutions (images/links removed, block tags turned into
    newlines/tabs, every remaining tag deleted)."""

    # Images and runs of exactly 7 spaces.
    # NOTE: the original pattern ended with a stray '|' which made it also
    # match the empty string at every position — harmless for the output
    # (empty matches were replaced by "") but pure wasted work; removed.
    removeImg = re.compile('<img.*?>| {7}')
    # Opening and closing anchor tags (the link text itself is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Block-level boundaries that should become line breaks.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become tab stops.
    replaceTD = re.compile('<td>')
    # Opening paragraph tags (with any attributes) become indented new lines.
    replacePara = re.compile('<p.*?>')
    # Explicit line breaks.
    replaceBR = re.compile('<br><br>|<br>')
    # Final sweep: delete any tag not handled above.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* converted to plain text.

        The substitution order matters: structural tags are mapped to
        whitespace first, then the catch-all removeExtraTag deletes
        whatever markup is left, and the result is stripped of leading
        and trailing whitespace.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
# Shared HTML-stripping helper used by url2txt().
tool=Tool()
# sys.path[0] is the directory containing this script; used as the default
# output directory in the example call at the bottom of the file.
path = sys.path[0]
def get_content(url, data=None):
    """Download *url* and return the response body decoded as UTF-8 text.

    Retries indefinitely on network errors, sleeping a random interval
    between attempts so the traffic looks less like an automated crawler.

    Args:
        url:  page address to fetch.
        data: unused; kept only for backward compatibility with callers.

    Returns:
        The page source as a str.
    """
    # Browser-like headers so the site serves the normal desktop page
    # (values captured from Chrome's developer tools, F12).
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Randomised timeout (seconds): varying request parameters makes the
    # client look less like a bot to the target site.
    timeout = random.choice(range(80, 180))
    while True:
        try:
            rep = requests.get(url, headers=header, timeout=timeout)
            rep.encoding = 'utf-8'  # force UTF-8 before .text decodes the body
            break
        # FIX: requests raises its own Timeout type, not socket.timeout, so
        # in the original code a request timeout fell through to the generic
        # socket.error branch and slept 20-60 s instead of the intended
        # short 8-15 s back-off. Catch it explicitly first.
        except (socket.timeout, requests.exceptions.Timeout) as e:
            print('3:', e)
            time.sleep(random.choice(range(8, 15)))
        except http.client.BadStatusLine as e:
            print('5:', e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print('6:', e)
            time.sleep(random.choice(range(5, 15)))
        # RequestException subclasses IOError/OSError, so socket.error alone
        # already caught it; listing it makes the intent explicit.
        except (socket.error, requests.exceptions.RequestException) as e:
            print('4:', e)
            time.sleep(random.choice(range(20, 60)))
    return rep.text  # decoded page source (not an exception, as the old comment claimed)
def get_data(html_text):
    """Parse *html_text* with BeautifulSoup and return the re-serialised markup.

    Round-tripping through the "html.parser" backend normalises the
    document (e.g. closes stray tags) before the regex-based
    Tool.replace() strips it down to plain text.
    """
    # The original pre-initialised `final = []` and immediately overwrote
    # it with a str — the dead assignment is dropped.
    bs = BeautifulSoup(html_text, "html.parser")
    return str(bs)
def write_text(data, path, name):
    """Write *data* to <path>/<name>.txt, UTF-8 encoded with a BOM.

    utf_8_sig prepends a byte-order mark so Windows tools (Notepad,
    Excel, ...) auto-detect the encoding; errors='ignore' silently drops
    any character the codec cannot represent.
    """
    # FIX: the original concatenated the Windows-only '\\' separator,
    # which on POSIX systems produced a single odd file name instead of
    # a file inside *path*; os.path.join is correct on every platform.
    with open(os.path.join(path, name + '.txt'), 'w',
              errors='ignore', newline='', encoding='utf_8_sig') as f:
        f.write(data)
def url2txt(url, path, name):
    """Fetch *url*, reduce it to plain text, save it and return it.

    The page is downloaded with get_content(), normalised through
    BeautifulSoup via get_data(), stripped of markup and newlines, then
    written to <path>/<name>.txt by write_text().

    Example:
        url2txt('https://zhuanlan.zhihu.com/p/61617200', path, 'txt')
    """
    page_source = get_content(url)
    normalised = get_data(page_source)
    plain = tool.replace(normalised).replace('\n', '')
    write_text(plain, path, name)
    return plain