package fetcher
import (
"bufio"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"io/ioutil"
"log"
"net/http"
"time"
"zdcrawl/config"
)
// Package-level state shared by every call to Fetch.
var (
	// rateLimiter delivers one tick every time.Second / config.Qps; Fetch
	// receives from it before issuing a request.
	// NOTE(review): assumes config.Qps is positive and small enough that the
	// quotient stays above zero — time.Tick returns a nil channel for a
	// non-positive interval, which would block Fetch forever. TODO confirm.
	rateLimiter = time.Tick(time.Second / config.Qps)

	// verboseLogging controls the pre-request log line in Fetch. It is
	// initialised from config.VerboseLogger and can be switched on later
	// through SetVerboseLogging.
	verboseLogging = config.VerboseLogger
)
// SetVerboseLogging switches on the "Fetching url" log line that Fetch prints
// before each request, regardless of the value taken from the configuration.
//
// NOTE(review): the flag is written without synchronisation; presumably this
// is called once during start-up, before any fetching begins — confirm.
func SetVerboseLogging() {
	verboseLogging = true
}
// Fetch downloads url with an HTTP GET and returns the response body
// transcoded to UTF-8.
//
// Each call first waits for a tick from the package-level rate limiter. The
// request is sent with a desktop-browser User-Agent header. The source
// encoding of the body is sniffed from its leading bytes by determineEncoding
// and the body is decoded accordingly.
//
// A non-nil error is returned when the request cannot be built, when sending
// it fails (network error, timeout, ...), when the server answers with a
// status other than 200 OK, or when reading/decoding the body fails.
func Fetch(url string) ([]byte, error) {
	<-rateLimiter
	if verboseLogging {
		log.Printf("Fetching url %s", url)
	}

	// Bound the whole exchange (connect, headers and body) so that a stalled
	// server cannot block the caller indefinitely.
	client := &http.Client{Timeout: 30 * time.Second}

	// Note: a plain http.Get(url) results in "wrong status code: 418" here,
	// so the request is built by hand in order to set a browser User-Agent
	// (presumably the server rejects Go's default one).
	request, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Add("User-Agent",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")

	// resp is nil whenever Do reports an error, so the error must be checked
	// before resp is touched.
	resp, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	log.Printf("Fetching url %s code:%d\n", url, resp.StatusCode)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}

	// The bufio.Reader lets determineEncoding peek at the head of the body
	// without consuming it; the same reader then feeds the decoder.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
// determineEncoding guesses the character encoding of the data readable from
// r without consuming any of it.
//
// It peeks at up to the first 1024 bytes and passes them to
// charset.DetermineEncoding with an empty content type. If no bytes at all
// can be peeked, the error is logged and unicode.UTF8 is returned as the
// fallback.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	// Peek returns the bytes that are available together with an error
	// (io.EOF for a short stream) when fewer than 1024 can be read. Inputs
	// shorter than the sniff window are still worth inspecting, so only give
	// up when nothing was obtained.
	head, err := r.Peek(1024)
	if err != nil && len(head) == 0 {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8
	}

	// The remaining results (canonical name and certainty flag) are not
	// errors and are not needed here.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}