Python crawler: logging in through SAML
(2017-08-20 22:39:49)
Tags: python requests saml crawler
Category: python
01
session_requests = requests.session()
# A Session persists certain parameters (cookies in particular) across requests;
# this post uses two sessions, one for the SP and one for the IdP.
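# Quick illustration of that persistence (uses the public httpbin test service;
# not part of the original script):
s = requests.session()
s.get('https://httpbin.org/cookies/set/k/v', verify=False)  # the server sets cookie k=v
print(s.cookies.get('k'))  # prints 'v' -- the session remembered it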
headers = {
    'Host': '192.168.1.130',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Connection': 'keep-alive'
}
# Step one: get the file.
# requests follows redirects by default; allow_redirects=False disables that,
# and verify=False turns off certificate verification.
result = session_requests.get(LOGIN_URL, headers=headers, verify=False, allow_redirects=False)
print(result.text)
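# With allow_redirects=False, the 30x response itself comes back and the target
# sits in the Location header. A hypothetical illustration (again via httpbin,
# not the original system):
r = requests.get('https://httpbin.org/redirect-to?url=/get', allow_redirects=False)
print(r.status_code, r.headers['location'])  # 302 /get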
url0 = abc.group(1)  # extract the first match (assumes an earlier re.search stored in abc)
url0 = result.headers['location']
# The location can also be pulled out directly like this: headers is a dict, so the
# value is looked up by key, and all the later link extractions work the same way.
# In HTML, the URL in an href is the redirect target, i.e. the value of location.
print("+++++++++Below
is the url of sp++++++++++++\n")
print(url0)
cookie0=requests.utils.dict_from_cookiejar(result.cookies)
#requests.utils.dict_from_cookiejar
把cookie从cookiejar中提取成字典形式
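# For example (hypothetical cookie, not from the real system):
jar = requests.cookies.cookiejar_from_dict({'sid': 'abc'})
print(requests.utils.dict_from_cookiejar(jar))  # {'sid': 'abc'}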
print("++++++++cookies
is: ++++++++\n")
print(cookie0)
print("********+++++++++step
one is finished++++++++++++********\n")
# Step two: authenticate.
result = session_requests.get(url0, verify=False, cookies=cookie0, allow_redirects=False)
print(result.text)
url_idp0 = abc.group(1)
url_idp = re.sub(r';', '&', url_idp0)
# In the URL parsed above, the separator before the RelayState part is a semicolon;
# it has to be replaced with '&': re.sub(r';', '&', url_idp0)
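# What the substitution does, on a made-up URL (not the real one):
demo = 'https://idp/prx/samlidp;RelayState=xyz'
print(re.sub(r';', '&', demo))  # https://idp/prx/samlidp&RelayState=xyz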
print("+++++++++Below
is the url of idp++++++++++++\n")
print(url_idp)
cookie1=requests.utils.dict_from_cookiejar(result.cookies)
print("++++++++cookies
is: ++++++++\n")
print(cookie1)
print("********+++++++++step
two is finished++++++++++++********\n")
# Step three: SSOService.
a = re.search(r'(samlidp(.+?)txt)', url_idp)
# Find the part of url_idp from "samlidp" through "txt". NB: this needed changing,
# because the URL does not necessarily end in "txt".
a = re.search(r'(samlidp(.+(.*?)))', url_idp)
# Updated: this one captures "samlidp" together with everything that follows it.
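# On the same made-up URL as above, group(1) of the updated pattern is
# everything from "samlidp" on:
m = re.search(r'(samlidp(.+(.*?)))', 'https://idp/prx/samlidp;RelayState=xyz')
print(m.group(1))  # samlidp;RelayState=xyz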
b = a.group(1)
b = re.sub(r';', '&', b)
cookie = '"http/192.168.1.181/' + str(b) + '"'
print("b is \n", b)
cookie_AAA = {
    "ANbookmark": cookie
}
print('cookie_AAA is \n', cookie)
result = session_requests.get(url_idp, verify=False, cookies=cookie_AAA, allow_redirects=False)
print(result.text)
abc = re.search(r'"(/prx/.*?)"', result.text)
# Find the substring between the double quotes, excluding the quotes themselves:
# whatever the parentheses enclose is exactly what gets captured.
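# Capture-group illustration on a made-up snippet of HTML:
m = re.search(r'"(/prx/.*?)"', '<a href="/prx/000/login">')
print(m.group(0))  # "/prx/000/login"  (quotes included)
print(m.group(1))  # /prx/000/login    (quotes excluded)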
print("+++++++++Below
is the url of url_cookie++++++++++++\n")
print(url_cookie)
print("++++++++cookies
is: ++++++++\n")
cookie2=result.cookies
headers=result.headers
print("The
headers is: \n",headers)
print(cookie2)
print("********+++++++++step
three is finished++++++++++++********\n")
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Referer": url_link
}
cookie00 = dict(cookie0, **cookie1)
# Manual testing in a browser showed that the cookies to send to the SP are the ones
# returned in steps one and two; this is where I was stuck for half a day.
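# dict(a, **b) builds a merged dict, with b's entries winning on duplicate keys:
x = {'JSESSIONID': '111', 'k': 'old'}
y = {'k': 'new'}
print(dict(x, **y))  # {'JSESSIONID': '111', 'k': 'new'}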
print('The cookies00 is', cookie00)
print('The headers is : \n', headers)
result = session_requests2.post(url_sp, headers=headers, cookies=cookie00, data=payload_saml, verify=False, allow_redirects=False)
print(result)
print(result.text)
cookies = result.cookies
print(cookies)
print("********+++++++++step eight is finished++++++++++++********\n")
# Step ten.
result = session_requests2.get(url_file, cookies=cookies, verify=False, allow_redirects=False)
print(result)
print(result.text)
cookies = result.cookies
print(cookies)
print("********+++++++++step ten is finished++++++++++++********\n")
main()
session_requests = requests.session()
print("===========starting run step 1: ======================\n")
# Step 1: get the file.
result = session_requests.get(LOGIN_URL, verify=False, allow_redirects=False)
print(result)
url0 = result.headers['location']
print("+++++++++Below is the url of sp++++++++++++\n")
print(url0)
cookie0 = requests.utils.dict_from_cookiejar(result.cookies)
print("********+++++++++step 1 is finished++++++++++++********\n")
# Step two: authenticate.
print("===========starting run step 2: ======================\n")
result = session_requests.get(url0, verify=False, cookies=cookie0, allow_redirects=False)
print(result)
print("+++++++++Below is the url of idp++++++++++++\n")
url_idp = result.headers['location']
print(url_idp)
cookie1 = requests.utils.dict_from_cookiejar(result.cookies)
print("********+++++++++step 2 is finished++++++++++++********\n")
# Step three: SSOService.
print("===========starting run step 3: ======================\n")
a = re.search(r'(samlidp(.+(.*?)))', url_idp)
b = a.group(1)
cookie = '"http/' + hostofvsite + '/' + str(b) + '"'
cookie_AAA = {
    "ANbookmark": cookie
}
result = session_requests.get(url_idp, verify=False, allow_redirects=False)
print(result)
abc = re.search(r'"(/prx/.*?)"', result.text)
url_cookie = abc.group(1)
print("+++++++++Below is the url of url_cookie++++++++++++\n")
print(url_cookie)
print("++++++++cookies is: ++++++++\n")
cookie2 = result.cookies
headers = result.headers
print("********+++++++++step 3 is finished++++++++++++********\n")
# Step four: log in to the IdP.
print("===========starting run step 4: ======================\n")
uid0 = uuid.uuid4()
uid1 = uid0.hex
uid2 = str(uid1)
uid3 = uid2[0:16]
payload = {
    "method": "default_method_localdb",
    "uname": "test",
    "pwd1": '',
    "pwd2": '',
    "pwd": "hengniyiweiwohuixiewojingchangyongdemimama",
    "submitbutton": "Sign In",
    "uniqueid": uid3
}
cookielogin = {
}
session_requests2 = requests.session()
result = session_requests2.post(url_login, cookies=cookie_AAA, data=payload, verify=False, allow_redirects=False)
print(result)
cookie_l = result.cookies
cookie_l_0 = requests.utils.dict_from_cookiejar(result.cookies)
print("********+++++++++step four is finished++++++++++++********\n")
url_ssoservice = result.headers['location']
print("The url_ssoservice is :\n", url_ssoservice)
# Step five: access ssoservice.
print("===========starting run step 5: ======================\n")
result = session_requests2.get(url_ssoservice, cookies=cookie_l, verify=False, allow_redirects=False)
print(result)
headforlink = result.headers
cookie_sso = requests.utils.dict_from_cookiejar(result.cookies)
print("********+++++++++step 5 is finished++++++++++++********\n")
url_link = result.headers['location']
print("The url of link: \n", url_link)
cookie_link = dict(cookie_l_0, **cookie_sso)
# Step six: access the link back.
print("===========starting run step 6: ======================\n")
result = session_requests2.get(url_link, headers=headforlink, cookies=cookie_link, verify=False, allow_redirects=False)
print(result)
cookies = requests.utils.dict_from_cookiejar(result.cookies)
print("********+++++++++step six is finished++++++++++++********\n")
saml = re.search(r'value="(.*?)"', result.text)
samldata = saml.group(1)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
# Step seven: access the SP.
print("===========starting run step 7: ======================\n")
payload_saml = {
    "SAMLResponse": samldata,
    "RelayState": LOGIN_URL
}
url_sp = 'https://' + LOGIN_URL_host + '/prx/000/http/localhost/saml2/sp1/module.php/saml/sp/saml2-acs.php/sp1'
length = len(payload_saml)
print("The url of sp: \n", url_sp)
cookie00 = dict(cookie0, **cookie1)
result = session_requests2.post(url_sp, cookies=cookie00, data=payload_saml, verify=False, allow_redirects=False)
print(result)
cookies = result.cookies
print("********+++++++++step 7 is finished++++++++++++********\n")
# Step eight: access the resource.
print("===========starting run step 8: ======================\n")
result = session_requests2.get(LOGIN_URL, cookies=cookies, verify=False, allow_redirects=False)
print(result)
print(result.text)
cookies = result.cookies
print(cookies)
print("********+++++++++step 8 is finished++++++++++++********\n")
main()
Sitting in the kitchen leeching Wi-Fi, because the connection is fastest here. I'm downloading a few things that will probably take a while, so let me post a crawler I spent several days writing a while back. It simulates logging in to one of our company's SAML-authenticated systems; for SAML itself, look it up on your own.
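At a high level the script drives an SP-initiated SAML login by hand: hit the protected resource, follow the redirect to the IdP, log in there, then post the SAMLResponse back to the SP's assertion consumer service. A minimal sketch of that skeleton, with placeholder hosts, paths, and form fields rather than the real system's:

import re
import warnings
import requests
warnings.filterwarnings("ignore")  # we run with verify=False throughout

sp = requests.session()   # session used toward the SP
idp = requests.session()  # session used toward the IdP

# 1. The protected resource answers with a redirect toward the IdP.
r = sp.get('https://sp.example.com/protected', verify=False, allow_redirects=False)
idp_url = r.headers['location']

# 2. Log in at the IdP (field names depend entirely on the IdP; these are made up).
r = idp.post(idp_url, data={'uname': 'test', 'pwd': 'secret'},
             verify=False, allow_redirects=False)

# 3. The IdP's reply carries the signed SAMLResponse in a form; post it back
#    to the SP's ACS endpoint together with the RelayState.
saml = re.search(r'name="SAMLResponse" value="(.*?)"', r.text).group(1)
r = sp.post('https://sp.example.com/acs',
            data={'SAMLResponse': saml,
                  'RelayState': 'https://sp.example.com/protected'},
            verify=False, allow_redirects=False)
print(r.status_code)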
02
Detailed walkthrough:
# -*- encoding: utf-8 -*-
# Time : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid
# uuid generates Universally Unique Identifiers.
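# The script only needs a short unique id, so later on it keeps the first
# 16 hex characters of a uuid4, e.g.:
print(uuid.uuid4().hex[0:16])  # something like 'a3f1c2d4e5b60718'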
warnings.filterwarnings("ignore")  # suppress warnings
from lxml import html
from requests.utils import quote
# quote is used for URL encoding.
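# quote percent-encodes characters that are unsafe inside a URL, e.g.:
print(quote('a b&c'))  # a%20b%26c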
print(LOGIN_URL)  # LOGIN_URL (the protected resource) is defined outside this excerpt
def main():
    # (body elided in the post; the full steps live in section 01 above)
    # "Connection": "keep-alive",
    # "Content-Type": "application/x-www-form-urlencoded",
    # requests generates some headers on its own, so these are unnecessary;
    # the POST in question even works without passing a headers argument at all.
    # "Content-Length": str(length),
    # "Host": "192.168.1.130",
    pass

if __name__ == '__main__':
    main()
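A quick way to see those auto-generated headers (an illustrative check, not from the original post):

req = requests.Request('POST', 'https://192.168.1.130/x', data={'a': 'b'}).prepare()
print(req.headers['Content-Type'])    # application/x-www-form-urlencoded
print(req.headers['Content-Length'])  # 3 (the body 'a=b' is three bytes)
# Host is filled in later, at send time, by the connection layer.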
03
Final streamlined script
# -*- encoding: utf-8 -*-
# Time : 2017/08/15
import requests
import time
import json
import re
import warnings
import uuid
import sys
warnings.filterwarnings("ignore")
from lxml import html
from requests.utils import quote
# PS: in line 140, "sp1" must match the name of the SP server on the proxy.
LOGIN_URL_host = "192.168.1.130"
hostofvsite = "192.168.1.181"
print(LOGIN_URL)
# username = sys.argv[1]  # In a formal test, change the value ("test") of uname in line 81 to username (no quotation marks);
# password = sys.argv[2]  # In a formal test, change the value of pwd in line 84 to password (no quotation marks);
def main():
    # (body elided in the post; same steps as the walkthrough above)
    pass

if __name__ == '__main__':
    main()
04
Updates:
1. In the final version, the URLs were parameterized as well.
2. Exception handling was added; very necessary (a sketch of what it could look like follows this list).
3. I forgot to carry that over into these notes, and I'm too lazy to fix it now, so this is how it stays.
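A minimal sketch of that exception handling, assuming a simple retry policy (the retry count and messages are my own choices, not what the final script actually did):

import requests

def get_with_retry(session, url, retries=3, **kwargs):
    # Wrap a GET so a transient network error does not kill the whole login flow.
    for attempt in range(1, retries + 1):
        try:
            return session.get(url, verify=False, allow_redirects=False, **kwargs)
        except requests.exceptions.RequestException as e:
            print("attempt %d for %s failed: %s" % (attempt, url, e))
    raise SystemExit("giving up on " + url)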
05
Summary:
When logging in to a SAML system, pay very close attention to how the cookies change from step to step. In short: be careful, be careful, and be careful again.
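One way to keep those changes visible is to dump the cookies after every step, e.g. with a small helper like this (my own convenience function, not from the post):

def show_cookies(step, response):
    # Print whatever cookies this step's response handed back.
    print("cookies after %s: %s" % (step, requests.utils.dict_from_cookiejar(response.cookies)))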
06
The kitchen is freezing; I'm heading back.