内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制

参考这里:https://www.jb51.net/article/89070.htm

下面开始参考作者dotcoo了:-)

看了《HTTP权威指南》第12章“HTTP基本认证机制”(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲得蛮详细的,于是写了一个小例子来测试。

请求响应过程(示例中的Host端口为12345,而下文示例代码监听的是8000端口;其中YWRtaW46YWRtaW5wd2Q=是“admin:adminpwd”的Base64编码):

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
Authorization: Basic YWRtaW46YWRtaW5wd2Q=
<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8

golang HTTP基本认证机制的实现代码

package main

import (
	"encoding/base64"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
)
// HelloServer is a demo handler that walks through HTTP Basic authentication
// by hand: it parses the Authorization header, prints the decoded user name
// and password to stdout, and answers "hello, world!\n".
//
// The credentials are only parsed, never validated, so any well-formed
// "Basic <base64(user:password)>" header is accepted. A request whose header
// is missing, malformed, uses another scheme, is not valid base64, or has no
// ':' separator receives 401 together with the Basic challenge.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	auth := req.Header.Get("Authorization")
	if auth == "" {
		rejectUnauthorized(w)
		return
	}
	fmt.Println(auth)

	// The header value has the form "<scheme> <credentials>".
	auths := strings.SplitN(auth, " ", 2)
	if len(auths) != 2 {
		fmt.Println("error")
		rejectUnauthorized(w)
		return
	}
	authMethod, authB64 := auths[0], auths[1]

	// Authentication scheme names are case-insensitive, so "basic" must be
	// treated the same as "Basic".
	if !strings.EqualFold(authMethod, "Basic") {
		fmt.Println("error")
		rejectUnauthorized(w)
		return
	}

	authstr, err := base64.StdEncoding.DecodeString(authB64)
	if err != nil {
		fmt.Println(err)
		rejectUnauthorized(w)
		return
	}
	fmt.Println(string(authstr))

	// Split on the first ':' only: the password itself may contain colons.
	userPwd := strings.SplitN(string(authstr), ":", 2)
	if len(userPwd) != 2 {
		fmt.Println("error")
		rejectUnauthorized(w)
		return
	}
	username, password := userPwd[0], userPwd[1]
	fmt.Println("Username:", username)
	fmt.Println("Password:", password)
	fmt.Println()

	io.WriteString(w, "hello, world!\n")
}

// rejectUnauthorized sends the Basic challenge with a 401 status and a short
// plain-text body.
func rejectUnauthorized(w http.ResponseWriter) {
	w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
	w.WriteHeader(http.StatusUnauthorized)
	io.WriteString(w, "Unauthorized!\n")
}
func main() {http.HandleFunc("/hello", HelloServer)err := http.ListenAndServe(":8000", nil)if err != nil {log.Fatal("ListenAndServe: ", err)}
}

试验了上面的例子后,基本明白了HTTP基本认证的过程。但是怎么用gocolly访问呢?

参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account

但是答复者Matías Insaurralde提供的模拟浏览器登录的例子编译不通过:原回答的代码引用了未定义的hptsKey和hptshKey,我也不明白它们的含义。代码放在下面供参考(可跳过):

package evernoteimport ("bytes""errors""fmt""io/ioutil""net/http""net/http/cookiejar""net/url""regexp""strings"
)const (evernoteLoginURL = "https://www.evernote.com/Login.action"
)var (evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)errNoMatches   = errors.New("No matches")errRedirectURL = errors.New("Redirect URL not found")
)// EvernoteClient wraps all methods required to interact with the website.
type EvernoteClient struct {Username   stringPassword   stringhttpClient *http.Client// These parameters persist during the login process:hpts  stringhptsh string
}// NewEvernoteClient initializes a new Evernote client.
func NewEvernoteClient(username, password string) *EvernoteClient {// Allocate a new cookie jar to mimic the browser behavior:cookieJar, _ := cookiejar.New(nil)// Fill up basic data:c := &EvernoteClient{Username: username,Password: password,}// When initializing the http.Client, copy default values from http.DefaultClient// Pass a pointer to the cookie jar that was created earlier:c.httpClient = &http.Client{Transport:     http.DefaultTransport,CheckRedirect: http.DefaultClient.CheckRedirect,Jar:           cookieJar,Timeout:       http.DefaultClient.Timeout,}return c
}func (e *EvernoteClient) extractJSParams(body []byte) (err error) {matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)if len(matches) == 0 {return errNoMatches}for _, submatches := range matches {if len(submatches) < 3 {err = errNoMatchesbreak}key := submatches[1]val := submatches[2]if bytes.Compare(key, hptsKey) == 0 {e.hpts = string(val)}if bytes.Compare(key, hptshKey) == 0 {e.hptsh = string(val)}}return nil
}// Login handles the login action.
func (e *EvernoteClient) Login() error {// First step: fetch the login page as a browser visitor would do:res, err := e.httpClient.Get(evernoteLoginURL)if err != nil {return err}if res.Body == nil {return errors.New("No response body")}body, err := ioutil.ReadAll(res.Body)if err != nil {return err}err = e.extractJSParams(body)if err != nil {return err}// Second step: we have extracted the "hpts" and "hptsh" parameters// We send a request using only the username and setting "evaluateUsername":values := &url.Values{}values.Set("username", e.Username)values.Set("evaluateUsername", "")values.Set("analyticsLoginOrigin", "login_action")values.Set("clipperFlow", "false")values.Set("showSwitchService", "true")values.Set("hpts", e.hpts)values.Set("hptsh", e.hptsh)rawValues := values.Encode()req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, bytes.NewBufferString(rawValues))if err != nil {return err}req.Header.Set("Accept", "application/json")req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")req.Header.Set("x-requested-with", "XMLHttpRequest")req.Header.Set("referer", evernoteLoginURL)res, err = e.httpClient.Do(req)if err != nil {return err}body, err = ioutil.ReadAll(res.Body)if err != nil {return err}bodyStr := string(body)if !strings.Contains(bodyStr, `"usePasswordAuth":true`) {return errors.New("Password auth not enabled")}// Third step: do the final request, append password to form data:values.Del("evaluateUsername")values.Set("password", e.Password)values.Set("login", "Sign in")rawValues = values.Encode()req, err = http.NewRequest(http.MethodPost, evernoteLoginURL, bytes.NewBufferString(rawValues))if err != nil {return err}req.Header.Set("Accept", "text/html")req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")req.Header.Set("x-requested-with", "XMLHttpRequest")req.Header.Set("referer", evernoteLoginURL)res, err = e.httpClient.Do(req)if err != nil {return err}// Check the body in order to find the 
redirect URL:body, err = ioutil.ReadAll(res.Body)if err != nil {return err}bodyStr = string(body)matches := evernoteRedirectExpr.FindAllStringSubmatch(bodyStr, -1)if len(matches) == 0 {return errRedirectURL}m := matches[0]if len(m) < 2 {return errRedirectURL}redirectURL := m[1]fmt.Println("Login is ok, redirect URL:", redirectURL)return nil
}
After you successfully get the redirect URL, you should be able to send authenticated requests as long as you keep using the HTTP client that was used for the login process, the cookie jar plays a very important role here.To call this code use:func main() {evernoteClient := NewEvernoteClient("user@company", "password")err := evernoteClient.Login()if err != nil {panic(err)}
}

只好自己写。经反复试验,发现对于本文开头搭建的server,只需以下代码即可通过验证,并输出“hello, world!”(将访问方式改为POST也一样)。注意该server只解析、并不校验用户名和密码,所以任意用户名和密码都能通过。

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// Login handles the login action.
//
// It sends one request with HTTP Basic credentials to the demo server at
// http://localhost:8000/hello and prints the response headers, the
// Www-Authenticate header value and the body. Any failure is printed and ends
// the call, so an unreachable server no longer causes a nil dereference.
func Login() {
	// Create a client with default settings.
	client := &http.Client{}
	// URL to visit.
	url := "http://localhost:8000/hello"
	// HTTP method; the author notes that POST works the same way.
	const method = "GET"

	// Build the request to submit.
	req, err := http.NewRequest(method, url, nil)
	if err != nil {
		fmt.Println("NewRequest:", err)
		return
	}
	// The most important line. Per the author, the user name and password can
	// be anything for the demo server.
	req.SetBasicAuth("aa", "bb")
	// Print the method actually used; the original always printed "POST".
	fmt.Println(method + "访问")

	// Send the request and read the result.
	res, err := client.Do(req)
	if err != nil {
		fmt.Println("Do:", err)
		return
	}
	defer res.Body.Close()

	fmt.Println("header:")
	fmt.Println(res.Header)

	fmt.Println("realm:")
	fmt.Println(res.Header.Get("Www-Authenticate"))

	fmt.Println("body:")
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("ReadAll:", err)
		return
	}
	fmt.Println(string(body))
}

// main runs the demo request once.
func main() {
	Login()
}

查看SetBasicAuth的定义为(liteide中在光标位置按Ctrl+shift+J):

// SetBasicAuth sets the request's Authorization header to "Basic " followed
// by the value basicAuth returns for the given user name and password.
func (r *Request) SetBasicAuth(username, password string) {
	credentials := basicAuth(username, password)
	r.Header.Set("Authorization", "Basic "+credentials)
}

而basicAuth的定义为

// basicAuth joins username and password with a ':' and returns the result
// encoded with the standard (padded) base64 alphabet, i.e. the credentials
// part of a "Basic" Authorization header value.
func basicAuth(username, password string) string {
	auth := username + ":" + password
	return base64.StdEncoding.EncodeToString([]byte(auth))
}

那么,用gocolly访问的代码如下:

package mainimport ("encoding/base64""fmt""net/http""github.com/gocolly/colly"
)func basicAuth(username, password string) string {auth := username + ":" + passwordreturn base64.StdEncoding.EncodeToString([]byte(auth))
}
func main() {c := colly.NewCollector()h := http.Header{}h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))c.OnResponse(func(r *colly.Response) {//fmt.Println(r)fmt.Println(string(r.Body))})c.Request("GET", "http://localhost:8000/hello", nil, nil, h)
}

注:对于其他网站,也许要用Fiddler抓包,设置相应的header和cookie才行。

转载于:https://www.cnblogs.com/pu369/p/10408898.html

(golang)HTTP基本认证机制及使用gocolly登录爬取相关推荐

  1. 克服反爬虫机制爬取智联招聘网站

    一.实验内容 1.爬取网站: 智联招聘网站(https://www.zhaopin.com/) 2.网站的反爬虫机制:     在我频繁爬取智联招聘网站之后,它会出现以下文字(尽管我已经控制了爬虫的爬 ...

  2. golang实现捧腹网爬取笑话

    爬虫的步骤见:here 以下golang代码实现对捧腹网笑话的爬取,并保存到本地的joy文件夹(程序会自行创建)内 package mainimport ("fmt""n ...

  3. jwt认证机制优势和原理_最详细的Spring Boot 使用JWT实现单点登录

    Json web token (JWT), 是为了在网络应用环境间传递声明而执行的一种基于JSON的开放标准((RFC 7519).该token被设计为紧凑且安全的,特别适用于分布式站点的单点登录(S ...

  4. SSO 认证机制对比

    2019独角兽企业重金招聘Python工程师标准>>> SSO 认证机制对比 博客分类: 架构 几种常用的认证机制 HTTP Basic Auth HTTP Basic Auth简单 ...

  5. 常用的认证机制之session认证和token认证

    一.session认证 1.session认证的过程: 前端输入用户名和密码进行登录操作,后端拿到用户名和密码后,会把md5进行加密,加密之后,拿上加密后的密文到用户表中查找密文是否一致,判断用户是否 ...

  6. 基于JWT的Token认证机制实现

    一.基于JWT的Token认证机制实现 1.什么是JWT JSON Web Token(JWT)是一个非常轻巧的规范.这个规范允许我们使用JWT在用户和服务器之间传递安全可靠的信息. 2.JWT组成 ...

  7. 深入详解windows安全认证机制ntlmKerberos

    0x01 为什么要理解windows 安全认证机制: 加深对后续各种漏洞利用的理解深度,还是那句话,要知其然,更要知其所以然,不废话,咱们直接开始 0x02 windows认证协议主要有以下两种: 基 ...

  8. elasticsearch httpclient认证机制

    转载自http://www.cnblogs.com/youran-he/p/7562870.html 最近公司单位搬迁,所有的服务都停止了,我负责的elasticsearch不知道怎么回事,一直不能运 ...

  9. web安全认证机制知多少

    如今web服务随处可见,成千上万的web程序被部署到公网上供用户访问,有些系统只针对指定用户开放,属于安全级别较高的web应用,他们需要有一种认证机制以保护系统资源的安全,本文将探讨五种常用的认证机制 ...

最新文章

  1. 研华物联网论坛和ARM技术研讨会随笔
  2. 设计模式 责任链模式
  3. Go语言---结构体
  4. 【MyBatis笔记】11-分步查询懒加载
  5. c++编码规范_汽车嵌入式软件测试——嵌入式软件标准及规范简介
  6. 《jQuery移动开发》—— 2.1 语义HTML5
  7. my04_Mysql复制数据一致性校验
  8. 编程实现二叉树的遍历
  9. 解决Android Studio Import Sample网络连接失败问题
  10. 微信公众号维护用服务器吗,微信公众号商城开发必须用服务器吗?
  11. OrCAD/Pspice元件库说明
  12. 优秀架构师必须具备的架构思维(自顶向下和自底向上架构设计思维)
  13. 1380Problem C:zyf的A+B问题
  14. 在Ubuntu20.04上安装ros
  15. 王艺瑞浙江大学计算机学院,关于公示2010年(秋)同等学力申请进入论文阶段学员名单的通知...
  16. 计算机二级不能使用快捷键,你不可不知的几个Office2010另类快捷键_计算机二级_Office快捷键_Office考试_课课家...
  17. 涅槃重生,力荐大型分布式手册,凤凰架构让你浴火成神,良心分享
  18. 在一页纸上打印8页PPT讲义的方法
  19. 从长远来看读博的收益是巨大的,为什么读博的人却很少?
  20. 阿里云RDS,登录提示:指定的网络名不再可用,错误64

热门文章

  1. 2019\Province_C_C++_B\试题E-迷宫
  2. python2.7 跨文件全局变量的方法
  3. 大数据WEB阶段Maven安装配置与使用
  4. 智能机器人领域有什么好书推荐的?
  5. 【Tools】MarkDown教程(五)-CSDN MarkDown介绍
  6. mysql字段数值累加_mysql字段值(字符串)累加 | 学步园
  7. swing打地鼠游戏_【亲子早教】9月早教亲子游戏
  8. 60条知乎神回复,大部分都比较有道理
  9. TCP/IP学习笔记(七)四次挥手
  10. 记录Hbase bug——org.apache.hadoop.hbase.PleaseHoldException: Master is initializing