
Python crawler: SAML login

(2017-08-20 22:39:49)
Tags: python, requests, saml, crawler

Category: python
01
Sitting in the kitchen leeching the Wi-Fi, because the kitchen has the fastest connection. I'm downloading some things that will probably take a while, so I'll post a crawler that I spent several days writing a while back.
This crawler simulates logging in to one of our company's SAML-authenticated systems; as for SAML itself, look it up on your own.
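Before the detailed version, a bird's-eye sketch of the SP-initiated flow that the script automates may help. Everything in it (host names, paths, form field names) is a placeholder for illustration, not the real values used further down:

#A minimal sketch of the SP-initiated SAML flow (placeholder URLs and field names)
import re
import warnings
import requests
warnings.filterwarnings("ignore")

SP_RESOURCE = "https://sp.example.com/protected.txt"      #placeholder for the protected file on the SP

sp  = requests.session()      #carries the SP-side cookies
idp = requests.session()      #carries the IdP-side cookies

#1. ask the SP for the resource; it answers with a redirect towards the IdP
r = sp.get(SP_RESOURCE, verify=False, allow_redirects=False)
idp_url = r.headers['location']

#2. log in at the IdP (the form field names depend on the IdP)
r = idp.post(idp_url, data={"uname": "test", "pwd": "secret"},
             verify=False, allow_redirects=False)

#3. follow the IdP's redirect; the page it returns carries the SAMLResponse in a hidden form field
r = idp.get(r.headers['location'], verify=False, allow_redirects=False)
saml = re.search(r'name="SAMLResponse" value="(.*?)"', r.text).group(1)

#4. post the SAMLResponse back to the SP's assertion-consumer URL, then fetch the resource with the SP session
sp.post("https://sp.example.com/saml/acs",
        data={"SAMLResponse": saml, "RelayState": SP_RESOURCE},
        verify=False, allow_redirects=False)
print(sp.get(SP_RESOURCE, verify=False).text)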

02
Detailed explanation:
#-*- encoding: utf-8 -*-
#Time  : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid                                                     #uuid: Universally Unique Identifier (used for the uniqueid field of the login form)
warnings.filterwarnings("ignore")                       #suppress warnings (e.g. the InsecureRequestWarning caused by verify=False)
from lxml import html
from requests.utils import quote                       #quote is used for URL-encoding

print(LOGIN_URL)                                        #LOGIN_URL (the protected file on the SP) is defined earlier in the full script; it is not spelled out in this excerpt

def main():
    session_requests = requests.session()           #a Session keeps certain parameters (cookies etc.) across requests; this write-up uses two sessions, one for the SP and one for the IdP
    headers={
        'Host':'192.168.1.130',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Connection':'keep-alive'
        }

    #step one, get file
    #requests follows redirects by default; allow_redirects=False forbids redirects, verify=False turns off certificate verification

    result = session_requests.get(LOGIN_URL,headers=headers,verify=False,allow_redirects=False)    
    print(result.text)
    abc=re.search(r'"(https://.*?)"',result.text)      #从result中查找两个引号之间https://及它后面的所有字符串
    url0=abc.group(1)                                      #提取出找到的第一个
    url0=result.headers['location']                       #可以直接用这种方式提取location,因为它的headers是字典形式的,所以根据key取出它的value,以下所以提取链接类似;html编程中,href中的链接就是重定向的链接,即location中的值
    print("+++++++++Below is the url of sp++++++++++++\n")
    print(url0)
    cookie0=requests.utils.dict_from_cookiejar(result.cookies)      #requests.utils.dict_from_cookiejar turns the cookies from the cookiejar into a plain dict
    print("++++++++cookies is: ++++++++\n")
    print(cookie0)
    print("********+++++++++step one is finished++++++++++++********\n")
    
    #step two,authenticate

    result = session_requests.get(url0,verify=False,cookies=cookie0,allow_redirects=False)
    print(result.text)
    abc=re.search(r'"(https://.*?)"',result.text)
    url_idp0=abc.group(1)
    url_idp=re.sub(r';','&',url_idp0)                    #in the URL parsed above there is a semicolon before the Relay part; it has to be replaced with &, hence re.sub(r';','&',url_idp0)
    print("+++++++++Below is the url of idp++++++++++++\n")
    print(url_idp)
    cookie1=requests.utils.dict_from_cookiejar(result.cookies)
    print("++++++++cookies is: ++++++++\n")
    print(cookie1)
    print("********+++++++++step two is finished++++++++++++********\n")

    #step three, SSOService
    a=re.search(r'(samlidp(.+?)txt)',url_idp)      #find the part of url_idp from samlidp up to txt; PS: this needs adjusting, since the URL does not necessarily end in txt
    a=re.search(r'(samlidp(.+(.*?)))',url_idp) #update: this one grabs samlidp together with everything that follows it
    b=a.group(1)
    b=re.sub(r';','&',b)
    cookie='"http/192.168.1.181/'+str(b)+'"'
    
    print("b is \n",b)
    cookie_AAA={
        "ANbookmark":cookie
        }
    print('cookie_AAA is \n',cookie)
    result = session_requests.get(url_idp,verify=False,cookies=cookie_AAA,allow_redirects=False)
    print(result.text)
    abc=re.search(r'"(/prx/.*?)"',result.text)       #查找引号之间的部分字符串,不包含引号,圆括号括起来哪些,就查找哪些
    
    
    print("+++++++++Below is the url of url_cookie++++++++++++\n")
    print(url_cookie)
    print("++++++++cookies is: ++++++++\n")
    cookie2=result.cookies
    headers=result.headers
    print("The headers is: \n",headers)
    print(cookie2)
    print("********+++++++++step three is finished++++++++++++********\n")
    
    #NOTE: this fragment jumps from step three straight to posting to the SP ("step eight" below);
    #steps four to seven (IdP login, SSOService, link back, building payload_saml) appear in the streamlined script further down
    headers={
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding":"gzip, deflate, sdch, br",
        "Accept-Language":"zh-CN,zh;q=0.8",
        "Cache-Control":"max-age=0",
        "Connection":"keep-alive",
        "Content-Type":"application/x-www-form-urlencoded",  #requests generates a number of headers automatically, so these are not required; the post below works even without a headers argument
        "Content-Length":str(length),                         #length = len(payload_saml), computed in the streamlined script
        "Host":"192.168.1.130",
        "Origin":"https://192.168.1.181",
        "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Referer":url_link
        }
    cookie00=dict(cookie0,**cookie1)                                     #manual testing in the browser showed that the cookies to send to the SP are the ones returned in steps one and two; this had me stuck for half a day
    print('The cookies00 is',cookie00)
    print('The headers is : \n',headers)
    result = session_requests2.post(url_sp,headers=headers,cookies=cookie00,data =payload_saml,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step eight is finished++++++++++++********\n")

    url_file="https://192.168.1.130/b.txt"
    #step ten
    result = session_requests2.get(url_file,cookies=cookies,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step ten is finished++++++++++++********\n")
    
if __name__ == '__main__':
    main()


03
The final, streamlined version
Script

#-*- encoding: utf-8 -*-
#Time  : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid
import sys
warnings.filterwarnings("ignore")
from lxml import html
from requests.utils import quote
#PS : In line 140, "sp1" is the same as the name of the SP server on the proxy;
LOGIN_URL_host= "192.168.1.130"
hostofvsite="192.168.1.181"
#LOGIN_URL (and the url_login / url_sp used further down) are assembled from these two hosts in the full script; they were parameterized in the final version and are not spelled out in this excerpt
print(LOGIN_URL)
#username=sys.argv[1]                #For a formal test, replace the value ("test") of uname in line 81 with username (no quotation marks);
#password=sys.argv[2]                #For a formal test, replace the value of pwd in line 84 with password (no quotation marks);
def main():
    session_requests = requests.session()
    print("===========starting run step 1: ======================\n")
    #step 1, get file
    result = session_requests.get(LOGIN_URL,verify=False,allow_redirects=False)
    print(result)
    url0=result.headers['location']
    print("+++++++++Below is the url of sp++++++++++++\n")
    print(url0)
    cookie0=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step 1 is finished++++++++++++********\n")
    
    #step two,authenticate
    print("===========starting run step 2: ======================\n")
    result = session_requests.get(url0,verify=False,cookies=cookie0,allow_redirects=False)
    print(result)
    print("+++++++++Below is the url of idp++++++++++++\n")
    url_idp=result.headers['location']
    print(url_idp)
    cookie1=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step 2 is finished++++++++++++********\n")
    #step three, SSOService
    print("===========starting run step 3: ======================\n")
    a=re.search(r'(samlidp(.+(.*?)))',url_idp)
    b=a.group(1)
    cookie='"http/'+hostofvsite+'/'+str(b)+'"'
    
    cookie_AAA={
        "ANbookmark":cookie
        }
    result = session_requests.get(url_idp,verify=False,allow_redirects=False)
    print(result)
    abc=re.search(r'"(/prx/.*?)"',result.text)
    
    url_cookie='https://'+hostofvsite+abc.group(1)
    
    print("+++++++++Below is the url of url_cookie++++++++++++\n")
    print(url_cookie)
    print("++++++++cookies is: ++++++++\n")
    cookie2=result.cookies
    headers=result.headers
    print("********+++++++++step 3 is finished++++++++++++********\n")
    #step four, login idp
    print("===========starting run step 4: ======================\n")
    uid0=uuid.uuid4()
    uid1=uid0.hex
    uid2=str(uid1)
    uid3=uid2[0:16]
    payload = {
        "method":"default_method_localdb",
        "uname": "test",
        "pwd1":'',
        "pwd2":'',
        "pwd": "hengniyiweiwohuixiewojingchangyongdemimama",
        "submitbutton":"Sign In",
        "uniqueid":uid3
    }
    cookielogin={
        
        }
    
    session_requests2 = requests.session()    
    result = session_requests2.post(url_login,cookies=cookie_AAA,data = payload,verify=False,allow_redirects=False)
    print(result)
    cookie_l=result.cookies
    cookie_l_0=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step four is finished++++++++++++********\n")
    
    url_ssoservice=result.headers['location']
    print("The url_ssoservice is :\n",url_ssoservice)
    
    #step five: access ssoservice
    print("===========starting run step 5: ======================\n")
    result = session_requests2.get(url_ssoservice,cookies=cookie_l,verify=False,allow_redirects=False)
    print(result)
    headforlink=result.headers
    cookie_sso=requests.utils.dict_from_cookiejar(result.cookies)
    
    print("********+++++++++step 5 is finished++++++++++++********\n")
    
    url_link=result.headers['location']
    print("The url of link: \n",url_link)
    cookie_link=dict(cookie_l_0,**cookie_sso)
    
    #step six, access link back
    print("===========starting run step 6: ======================\n")
    result = session_requests2.get(url_link,headers=headforlink,cookies=cookie_link,verify=False,allow_redirects=False)
    print(result)
    cookies=requests.utils.dict_from_cookiejar(result.cookies)
    
    print("********+++++++++step six is finished++++++++++++********\n")
    
    saml=re.search(r'value="(.*?)"',result.text)
    samldata=saml.group(1)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    #step seven, access sp
    print("===========starting run step 7: ======================\n")
    payload_saml={
        "SAMLResponse":samldata,
        "RelayState":LOGIN_URL
        }
    length=len(payload_saml)               #note: this is only the number of form fields; it is not used further in this streamlined version (requests sets Content-Length itself)
    print("The url of sp: \n",url_sp)
    cookie00=dict(cookie0,**cookie1)
    result = session_requests2.post(url_sp,cookies=cookie00,data = payload_saml,verify=False,allow_redirects=False)
    print(result)
    cookies=result.cookies
    print("********+++++++++step 7 is finished++++++++++++********\n")
    #step eight, access source
    print("===========starting run step 8: ======================\n")
    result = session_requests2.get(LOGIN_URL,cookies=cookies,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step 8 is finished++++++++++++********\n")
    
if __name__ == '__main__':
    main()
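
A side note on step six above: from lxml import html is imported but never used, and the SAMLResponse is pulled out with a bare value="(.*?)" regex, which simply grabs the first value attribute on the page and would miss a RelayState field if the IdP returned one. Below is a sketch of doing the same extraction with the already-imported lxml; the field names SAMLResponse / RelayState are the standard HTTP-POST-binding names, and whether this IdP's page uses exactly this form layout is an assumption:

#A sketch of parsing the IdP's auto-submit form with lxml instead of the regex in step six
#(assumes the page carries one form with hidden SAMLResponse / RelayState inputs -- the usual HTTP-POST binding)
from lxml import html

def parse_saml_form(page_text):
    tree = html.fromstring(page_text)
    action = tree.xpath('//form/@action')                      #where the form posts to, i.e. the SP's assertion-consumer URL
    fields = {inp.get('name'): inp.get('value', '')
              for inp in tree.xpath('//form//input[@name]')}   #all named inputs, SAMLResponse included
    return (action[0] if action else None), fields

#usage in place of the regex in step six:
#    url_sp_parsed, payload_saml = parse_saml_form(result.text)
#    payload_saml.setdefault("RelayState", LOGIN_URL)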


04
Updates:
1. In the final version, the URLs were parameterized as well;
2. Exception handling was added; very necessary (a sketch of both changes follows below);
3. I forgot to fold these into this note and am too lazy to change it now, so it stays as it is;
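
The final version itself didn't make it into this note, but here is a minimal sketch of what those two updates could look like. The get_step helper is illustrative rather than the real script's code; the /b.txt path is taken from the walk-through above:

#A sketch of the two updates: URLs assembled from parameters, and exception handling around each request
import sys
import requests

LOGIN_URL_host = "192.168.1.130"
LOGIN_URL = "https://" + LOGIN_URL_host + "/b.txt"       #the /b.txt path comes from the walk-through above

def get_step(session, url, step, **kwargs):
    #one GET with the error handling the plain script lacks
    try:
        result = session.get(url, verify=False, allow_redirects=False, **kwargs)
        result.raise_for_status()                        #4xx/5xx become exceptions; 3xx redirects pass through untouched
        return result
    except requests.exceptions.RequestException as e:
        print("step %s failed for %s: %s" % (step, url, e))
        sys.exit(1)

#usage, e.g. for step 1:
#    result = get_step(session_requests, LOGIN_URL, 1)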

05
Summary:
When logging in to a SAML system you have to watch the cookie changes very closely; in short: be careful, careful, and careful again;
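
Concretely, the pattern the script leans on is: turn each response's cookiejar into a dict with dict_from_cookiejar and merge the dicts before the next request (as in cookie00=dict(cookie0,**cookie1)). A small helper, sketched with only calls that already appear above, makes the merging explicit:

#A sketch of a helper that merges the cookies of several responses into one dict
from requests.utils import dict_from_cookiejar

def merged_cookies(*responses):
    #later responses win on key clashes, mirroring dict(cookie0, **cookie1) in the script
    merged = {}
    for r in responses:
        merged.update(dict_from_cookiejar(r.cookies))
    return merged

#e.g. the cookies for the post to the SP in step 7:
#    cookie00 = merged_cookies(result_step1, result_step2)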

06
The kitchen is freezing; I'm heading back.
