
Python crawler: SAML login

(2017-08-20 22:39:49)
Tags: python, requests, saml, crawler

Category: python
01
Sitting in the kitchen leeching the Wi-Fi, because the kitchen has the fastest connection. I'm downloading some things that will probably take a while, so I'll post a crawler that I spent several days writing a while back.
This crawler simulates logging in to one of our company's SAML-authenticated systems; as for SAML itself, look it up on your own.
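Before the detailed version, a bird's-eye sketch of the SP-initiated flow that the script automates may help. Everything in it (host names, paths, form field names) is a placeholder for illustration, not the real values used further down:

#A minimal sketch of the SP-initiated SAML flow (placeholder URLs and field names)
import re
import warnings
import requests
warnings.filterwarnings("ignore")

SP_RESOURCE = "https://sp.example.com/protected.txt"      #placeholder for the protected file on the SP

sp  = requests.session()      #carries the SP-side cookies
idp = requests.session()      #carries the IdP-side cookies

#1. ask the SP for the resource; it answers with a redirect towards the IdP
r = sp.get(SP_RESOURCE, verify=False, allow_redirects=False)
idp_url = r.headers['location']

#2. log in at the IdP (the form field names depend on the IdP)
r = idp.post(idp_url, data={"uname": "test", "pwd": "secret"},
             verify=False, allow_redirects=False)

#3. follow the IdP's redirect; the page it returns carries the SAMLResponse in a hidden form field
r = idp.get(r.headers['location'], verify=False, allow_redirects=False)
saml = re.search(r'name="SAMLResponse" value="(.*?)"', r.text).group(1)

#4. post the SAMLResponse back to the SP's assertion-consumer URL, then fetch the resource with the SP session
sp.post("https://sp.example.com/saml/acs",
        data={"SAMLResponse": saml, "RelayState": SP_RESOURCE},
        verify=False, allow_redirects=False)
print(sp.get(SP_RESOURCE, verify=False).text)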

02
Detailed explanation:
#-*- encoding: utf-8 -*-
#Time  : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid                                                     #uuid: Universally Unique Identifier (used for the uniqueid field of the login form)
warnings.filterwarnings("ignore")                       #suppress warnings (e.g. the InsecureRequestWarning caused by verify=False)
from lxml import html
from requests.utils import quote                       #quote is used for URL-encoding

print(LOGIN_URL)                                        #LOGIN_URL (the protected file on the SP) is defined earlier in the full script; it is not spelled out in this excerpt

def main():
    session_requests = requests.session()           #a Session keeps certain parameters (cookies etc.) across requests; this write-up uses two sessions, one for the SP and one for the IdP
    headers={
        'Host':'192.168.1.130',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Connection':'keep-alive'
        }

    #step one, get file
    #requests follows redirects by default; allow_redirects=False forbids redirects, verify=False turns off certificate verification

    result = session_requests.get(LOGIN_URL,headers=headers,verify=False,allow_redirects=False)    
    print(result.text)
    abc=re.search(r'"(https://.*?)"',result.text)      #从result中查找两个引号之间https://及它后面的所有字符串
    url0=abc.group(1)                                      #提取出找到的第一个
    url0=result.headers['location']                       #可以直接用这种方式提取location,因为它的headers是字典形式的,所以根据key取出它的value,以下所以提取链接类似;html编程中,href中的链接就是重定向的链接,即location中的值
    print("+++++++++Below is the url of sp++++++++++++\n")
    print(url0)
    cookie0=requests.utils.dict_from_cookiejar(result.cookies)      #requests.utils.dict_from_cookiejar turns the cookies from the cookiejar into a plain dict
    print("++++++++cookies is: ++++++++\n")
    print(cookie0)
    print("********+++++++++step one is finished++++++++++++********\n")
    
    #step two,authenticate

    result = session_requests.get(url0,verify=False,cookies=cookie0,allow_redirects=False)
    print(result.text)
    abc=re.search(r'"(https://.*?)"',result.text)
    url_idp0=abc.group(1)
    url_idp=re.sub(r';','&',url_idp0)                    #in the URL parsed above there is a semicolon before the Relay part; it has to be replaced with &, hence re.sub(r';','&',url_idp0)
    print("+++++++++Below is the url of idp++++++++++++\n")
    print(url_idp)
    cookie1=requests.utils.dict_from_cookiejar(result.cookies)
    print("++++++++cookies is: ++++++++\n")
    print(cookie1)
    print("********+++++++++step two is finished++++++++++++********\n")

    #step three, SSOService
    a=re.search(r'(samlidp(.+?)txt)',url_idp)      #find the part of url_idp from samlidp up to txt; PS: this needs adjusting, since the URL does not necessarily end in txt
    a=re.search(r'(samlidp(.+(.*?)))',url_idp) #update: this one grabs samlidp together with everything that follows it
    b=a.group(1)
    b=re.sub(r';','&',b)
    cookie='"http/192.168.1.181/'+str(b)+'"'
    
    print("b is \n",b)
    cookie_AAA={
        "ANbookmark":cookie
        }
    print('cookie_AAA is \n',cookie)
    result = session_requests.get(url_idp,verify=False,cookies=cookie_AAA,allow_redirects=False)
    print(result.text)
    abc=re.search(r'"(/prx/.*?)"',result.text)       #查找引号之间的部分字符串,不包含引号,圆括号括起来哪些,就查找哪些
    
    
    print("+++++++++Below is the url of url_cookie++++++++++++\n")
    print(url_cookie)
    print("++++++++cookies is: ++++++++\n")
    cookie2=result.cookies
    headers=result.headers
    print("The headers is: \n",headers)
    print(cookie2)
    print("********+++++++++step three is finished++++++++++++********\n")
    
    #NOTE: this fragment jumps from step three straight to posting to the SP ("step eight" below);
    #steps four to seven (IdP login, SSOService, link back, building payload_saml) appear in the streamlined script further down
    headers={
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding":"gzip, deflate, sdch, br",
        "Accept-Language":"zh-CN,zh;q=0.8",
        "Cache-Control":"max-age=0",
        "Connection":"keep-alive",
        "Content-Type":"application/x-www-form-urlencoded",  #requests generates a number of headers automatically, so these are not required; the post below works even without a headers argument
        "Content-Length":str(length),                         #length = len(payload_saml), computed in the streamlined script
        "Host":"192.168.1.130",
        "Origin":"https://192.168.1.181",
        "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Referer":url_link
        }
    cookie00=dict(cookie0,**cookie1)                                     #manual testing in the browser showed that the cookies to send to the SP are the ones returned in steps one and two; this had me stuck for half a day
    print('The cookies00 is',cookie00)
    print('The headers is : \n',headers)
    result = session_requests2.post(url_sp,headers=headers,cookies=cookie00,data =payload_saml,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step eight is finished++++++++++++********\n")

    url_file="https://192.168.1.130/b.txt"
    #step ten
    result = session_requests2.get(url_file,cookies=cookies,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step ten is finished++++++++++++********\n")
    
if __name__ == '__main__':
    main()


03
The final, streamlined version
Script

#-*- encoding: utf-8 -*-
#Time  : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid
import sys
warnings.filterwarnings("ignore")
from lxml import html
from requests.utils import quote
#PS : In line 140, "sp1" is the same as the name of the SP server on the proxy;
LOGIN_URL_host= "192.168.1.130"
hostofvsite="192.168.1.181"
#LOGIN_URL (and the url_login / url_sp used further down) are assembled from these two hosts in the full script; they were parameterized in the final version and are not spelled out in this excerpt
print(LOGIN_URL)
#username=sys.argv[1]                #For a formal test, replace the value ("test") of uname in line 81 with username (no quotation marks);
#password=sys.argv[2]                #For a formal test, replace the value of pwd in line 84 with password (no quotation marks);
def main():
    session_requests = requests.session()
    print("===========starting run step 1: ======================\n")
    #step 1, get file
    result = session_requests.get(LOGIN_URL,verify=False,allow_redirects=False)
    print(result)
    url0=result.headers['location']
    print("+++++++++Below is the url of sp++++++++++++\n")
    print(url0)
    cookie0=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step 1 is finished++++++++++++********\n")
    
    #step two,authenticate
    print("===========starting run step 2: ======================\n")
    result = session_requests.get(url0,verify=False,cookies=cookie0,allow_redirects=False)
    print(result)
    print("+++++++++Below is the url of idp++++++++++++\n")
    url_idp=result.headers['location']
    print(url_idp)
    cookie1=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step 2 is finished++++++++++++********\n")
    #step three, SSOService
    print("===========starting run step 3: ======================\n")
    a=re.search(r'(samlidp(.+(.*?)))',url_idp)
    b=a.group(1)
    cookie='"http/'+hostofvsite+'/'+str(b)+'"'
    
    cookie_AAA={
        "ANbookmark":cookie
        }
    result = session_requests.get(url_idp,verify=False,allow_redirects=False)
    print(result)
    abc=re.search(r'"(/prx/.*?)"',result.text)
    
    url_cookie='https://'+hostofvsite+abc.group(1)
    
    print("+++++++++Below is the url of url_cookie++++++++++++\n")
    print(url_cookie)
    print("++++++++cookies is: ++++++++\n")
    cookie2=result.cookies
    headers=result.headers
    print("********+++++++++step 3 is finished++++++++++++********\n")
    #step four, login idp
    print("===========starting run step 4: ======================\n")
    uid0=uuid.uuid4()
    uid1=uid0.hex
    uid2=str(uid1)
    uid3=uid2[0:16]
    payload = {
        "method":"default_method_localdb",
        "uname": "test",
        "pwd1":'',
        "pwd2":'',
        "pwd": "hengniyiweiwohuixiewojingchangyongdemimama",
        "submitbutton":"Sign In",
        "uniqueid":uid3
    }
    cookielogin={
        
        }
    
    session_requests2 = requests.session()    
    result = session_requests2.post(url_login,cookies=cookie_AAA,data = payload,verify=False,allow_redirects=False)
    print(result)
    cookie_l=result.cookies
    cookie_l_0=requests.utils.dict_from_cookiejar(result.cookies)
    print("********+++++++++step four is finished++++++++++++********\n")
    
    url_ssoservice=result.headers['location']
    print("The url_ssoservice is :\n",url_ssoservice)
    
    #step five: access ssoservice
    print("===========starting run step 5: ======================\n")
    result = session_requests2.get(url_ssoservice,cookies=cookie_l,verify=False,allow_redirects=False)
    print(result)
    headforlink=result.headers
    cookie_sso=requests.utils.dict_from_cookiejar(result.cookies)
    
    print("********+++++++++step 5 is finished++++++++++++********\n")
    
    url_link=result.headers['location']
    print("The url of link: \n",url_link)
    cookie_link=dict(cookie_l_0,**cookie_sso)
    
    #step six, access link back
    print("===========starting run step 6: ======================\n")
    result = session_requests2.get(url_link,headers=headforlink,cookies=cookie_link,verify=False,allow_redirects=False)
    print(result)
    cookies=requests.utils.dict_from_cookiejar(result.cookies)
    
    print("********+++++++++step six is finished++++++++++++********\n")
    
    saml=re.search(r'value="(.*?)"',result.text)
    samldata=saml.group(1)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    #step seven, access sp
    print("===========starting run step 7: ======================\n")
    payload_saml={
        "SAMLResponse":samldata,
        "RelayState":LOGIN_URL
        }
    length=len(payload_saml)               #note: this is only the number of form fields; it is not used further in this streamlined version (requests sets Content-Length itself)
    print("The url of sp: \n",url_sp)
    cookie00=dict(cookie0,**cookie1)
    result = session_requests2.post(url_sp,cookies=cookie00,data = payload_saml,verify=False,allow_redirects=False)
    print(result)
    cookies=result.cookies
    print("********+++++++++step 7 is finished++++++++++++********\n")
    #step eight, access source
    print("===========starting run step 8: ======================\n")
    result = session_requests2.get(LOGIN_URL,cookies=cookies,verify=False,allow_redirects=False)
    print(result)
    print(result.text)
    cookies=result.cookies
    print(cookies)
    print("********+++++++++step 8 is finished++++++++++++********\n")
    
if __name__ == '__main__':
    main()
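
A side note on step six above: from lxml import html is imported but never used, and the SAMLResponse is pulled out with a bare value="(.*?)" regex, which simply grabs the first value attribute on the page and would miss a RelayState field if the IdP returned one. Below is a sketch of doing the same extraction with the already-imported lxml; the field names SAMLResponse / RelayState are the standard HTTP-POST-binding names, and whether this IdP's page uses exactly this form layout is an assumption:

#A sketch of parsing the IdP's auto-submit form with lxml instead of the regex in step six
#(assumes the page carries one form with hidden SAMLResponse / RelayState inputs -- the usual HTTP-POST binding)
from lxml import html

def parse_saml_form(page_text):
    tree = html.fromstring(page_text)
    action = tree.xpath('//form/@action')                      #where the form posts to, i.e. the SP's assertion-consumer URL
    fields = {inp.get('name'): inp.get('value', '')
              for inp in tree.xpath('//form//input[@name]')}   #all named inputs, SAMLResponse included
    return (action[0] if action else None), fields

#usage in place of the regex in step six:
#    url_sp_parsed, payload_saml = parse_saml_form(result.text)
#    payload_saml.setdefault("RelayState", LOGIN_URL)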


04
Updates:
1. In the final version, the URLs were parameterized as well;
2. Exception handling was added; very necessary (a sketch of both changes follows below);
3. I forgot to fold these into this note and am too lazy to change it now, so it stays as it is;
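
The final version itself didn't make it into this note, but here is a minimal sketch of what those two updates could look like. The get_step helper is illustrative rather than the real script's code; the /b.txt path is taken from the walk-through above:

#A sketch of the two updates: URLs assembled from parameters, and exception handling around each request
import sys
import requests

LOGIN_URL_host = "192.168.1.130"
LOGIN_URL = "https://" + LOGIN_URL_host + "/b.txt"       #the /b.txt path comes from the walk-through above

def get_step(session, url, step, **kwargs):
    #one GET with the error handling the plain script lacks
    try:
        result = session.get(url, verify=False, allow_redirects=False, **kwargs)
        result.raise_for_status()                        #4xx/5xx become exceptions; 3xx redirects pass through untouched
        return result
    except requests.exceptions.RequestException as e:
        print("step %s failed for %s: %s" % (step, url, e))
        sys.exit(1)

#usage, e.g. for step 1:
#    result = get_step(session_requests, LOGIN_URL, 1)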

05
Summary:
When logging in to a SAML system you have to watch the cookie changes very closely; in short: be careful, careful, and careful again;
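
Concretely, the pattern the script leans on is: turn each response's cookiejar into a dict with dict_from_cookiejar and merge the dicts before the next request (as in cookie00=dict(cookie0,**cookie1)). A small helper, sketched with only calls that already appear above, makes the merging explicit:

#A sketch of a helper that merges the cookies of several responses into one dict
from requests.utils import dict_from_cookiejar

def merged_cookies(*responses):
    #later responses win on key clashes, mirroring dict(cookie0, **cookie1) in the script
    merged = {}
    for r in responses:
        merged.update(dict_from_cookiejar(r.cookies))
    return merged

#e.g. the cookies for the post to the SP in step 7:
#    cookie00 = merged_cookies(result_step1, result_step2)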

06
The kitchen is freezing; I'm heading back.
