// Program 1: joke scraper (爬取段子).
package main
import (
"fmt"
"strconv"
"net/http"
"io"
"regexp"
"strings"
"os"
)
// HttpGetJokePage fetches url with an HTTP GET and returns the complete
// response body as a string.
//
// It panics if the request cannot be made or if reading the body fails. The
// signature has no error result, so a panic is the only way a failure can be
// reported to the caller.
func HttpGetJokePage(url string) string {
	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// io.ReadAll drains the body up to EOF and surfaces every read error,
	// including one that arrives together with zero bytes. A hand-written
	// "stop when n == 0" loop would mistake that case for the end of the page
	// and silently return truncated HTML.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	return string(body)
}
// Patterns used by OneJoke. They are compiled once at package initialisation
// rather than on every call.
var (
	// jokeTitleRe captures the text between the first <h1> and </h1> pair.
	jokeTitleRe = regexp.MustCompile(`<h1>(?s:(.*?))</h1>`)
	// jokeContentRe captures the text between the content <div> opening tag
	// and the next `<a id="prev"` anchor.
	jokeContentRe = regexp.MustCompile(`<div class="content-txt pt10">(?s:(.*?))<a id="prev"`)
)

// OneJoke downloads the joke page at url and extracts the joke's title and
// content from its HTML.
//
// Only the first match of each pattern is used. Spaces and tab characters are
// removed from both results; newlines are kept. A result is the empty string
// when its pattern does not occur in the page.
func OneJoke(url string) (title, Content string) {
	page := HttpGetJokePage(url)

	if m := jokeTitleRe.FindStringSubmatch(page); m != nil {
		title = strings.Replace(m[1], " ", "", -1)
		title = strings.Replace(title, "\t", "", -1)
	}

	if m := jokeContentRe.FindStringSubmatch(page); m != nil {
		Content = strings.Replace(m[1], " ", "", -1)
		Content = strings.Replace(Content, "\t", "", -1)
		// NOTE(review): this appears to repeat the space removal two lines up;
		// the intended pattern may have been something else (for example an
		// HTML entity) — confirm before removing.
		Content = strings.Replace(Content, " ", "", -1)
	}

	return title, Content
}
// 封装函数 爬取带有10个笑话的页面数据
func Sipder10JokePage(i int, quitchan<- int) {
// 获取 当前页对应的url
url:="https://www.pengfu.com/xiaohua_"+ strconv.Itoa(i) +".html"
// 使用url 获取当前网页的所有数据
result := HttpGetJokePage(url)
// 编译、解析正则表达式,获取 每一个笑话对应的页面的 url(共有10处)
ret := regexp.MustCompile(`<h1 class="dp-b"><a href="(?s:(.*?))"`)
alls := ret.FindAllStringSubmatch(result, -1)
// 创建 标题存储的 切片 titleS、 存储内容的切片contentS
titleS := make([]string,0)
contentS := make([]string,0)
// 遍历 ,循环 10 次
for _, jokeURL :=range alls {
title, content := OneJoke(jokeURL[1])
//fmt.Println("title:", title)
//fmt.Println("content:", content)
// 将每一个title 、content 添加到 对应切片中
titleS = append(titleS, title)
contentS = append(contentS, content)
}
// 保存到文件中
SaveJoke2File(i, titleS, contentS)
quit <- i
}
// SaveJoke2File writes the jokes of listing page i to a text file named
// "第<i>页.txt" in the current working directory, replacing any existing file.
//
// Each joke is written as its title, its content and a dashed separator line,
// every line terminated with "\r\n". contentS must hold at least as many
// entries as titleS; entry k of both slices belongs to the same joke.
//
// It panics if the file cannot be created or written.
func SaveJoke2File(i int, titleS, contentS []string) {
	fileName := "第" + strconv.Itoa(i) + "页.txt"
	f, err := os.Create(fileName)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	const separator = "-------------------------------------------------------------------------\r\n"

	// Assemble the whole file in memory so it reaches the disk in one write
	// whose error can be checked.
	var out strings.Builder
	for idx, title := range titleS {
		out.WriteString(title + "\r\n")
		out.WriteString(contentS[idx] + "\r\n")
		out.WriteString(separator)
	}
	if _, err := f.WriteString(out.String()); err != nil {
		panic(err)
	}
}
// DoWork crawls the listing pages start..end (inclusive) concurrently.
//
// One goroutine running Sipder10JokePage is launched per page. Each goroutine
// reports its page number on a shared unbuffered channel, and DoWork prints
// one completion line per report, in whatever order the pages finish. It
// returns once every launched page has reported.
func DoWork(start, end int) {
	done := make(chan int)

	// Fan out: one crawler goroutine per page number.
	page := start
	for page <= end {
		go Sipder10JokePage(page, done)
		page++
	}

	// Fan in: receive exactly as many reports as goroutines were started.
	for pending := start; pending <= end; pending++ {
		finished := <-done
		fmt.Printf("第%d页笑话爬取完毕...\n", finished)
	}
}
// main asks the user for the first and last listing page to crawl, validates
// the answers and hands the range to DoWork.
//
// Nothing is crawled when either number cannot be read or when the range
// violates the constraints stated in the prompts (start >= 1, end >= start);
// a message explaining the problem is printed instead.
func main() {
	var start, end int

	// 1. Read the page range from standard input.
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("读取起始页失败:", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("读取终止页失败:", err)
		return
	}

	// Reject ranges the prompts rule out rather than crawling a page such as
	// number 0 because of a typo.
	if start < 1 || end < start {
		fmt.Printf("页码范围无效: start=%d, end=%d\n", start, end)
		return
	}

	// 2. Crawl the requested pages.
	DoWork(start, end)
}
// Program 2: Douyu image scraper (斗鱼图片爬取).
package main
import (
"fmt"
"strconv"
"net/http"
"io"
"regexp"
"os"
)
// SaveImg downloads the image at url and stores it as
// "C:/itcast/img/<idx+1>.jpg", then reports idx on page.
//
// idx is always sent on page exactly once, whether the download succeeded or
// not. The caller waits for one value per started download, so it would block
// forever if a failed download stayed silent. Failures are reported by
// printing a message. page is send-only here; a bidirectional chan int may be
// passed.
func SaveImg(idx int, url string, page chan<- int) {
	// Registered first so it runs last, after the file and body are closed.
	defer func() { page <- idx }()

	// Download before creating the file so a failed request leaves no empty
	// .jpg behind.
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println(" http.Get err:", err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// An error page must not be stored as if it were the image.
		fmt.Println(" http.Get err: unexpected status", resp.Status)
		return
	}

	path := "C:/itcast/img/" + strconv.Itoa(idx+1) + ".jpg"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println(" os.Create err:", err)
		return
	}
	defer f.Close()

	// Stream the body straight into the file; read and write errors both
	// surface here.
	if _, err := io.Copy(f, resp.Body); err != nil {
		fmt.Println(" io.Copy err:", err)
	}
}
func main() {
url :="https://www.douyu.com/g_yz"
// 爬取 整个页面,将整个页面全部信息,保存在result
result, err := HttpGet(url)
if err != nil {
fmt.Println("HttpGet err:", err)
return
}
// 解析编译正则
ret := regexp.MustCompile(`data-original="(?s:(.*?))"`)
// 提取每一张图片的url
alls := ret.FindAllStringSubmatch(result, -1)
page := make(chan int)
n := len(alls)
for idx, imgURL :=range alls {
//fmt.Println("imgURL:", imgURL[1])
go SaveImg(idx, imgURL[1], page)
}
for i:=0; i
fmt.Printf("下载第 %d 张图片完成\n", <- page)
}
}
// HttpGet fetches url with an HTTP GET and returns the complete response body
// in result.
//
// On any failure, whether making the request or reading the body, result is
// the empty string and err describes the problem. A partially read page is
// never returned.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// io.ReadAll reports every read error, including one delivered together
	// with zero bytes, which a hand-written "stop when n == 0" loop would treat
	// as a normal end of the page.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}