C++抓取网页内容
Windows VC下的做法:
#include <stdio.h>
#include <afxinet.h>
int main(int argc, char* argv[])
{
CInternetSession session("HttpClient");
char * url =
" http://www.imobile.com.cn/simcard.php?simcard=1392658";
CHttpFile*
pfile = (CHttpFile *)session.OpenURL(url);
DWORD
dwStatusCode;
pfile
-> QueryInfoStatusCode(dwStatusCode);
if(dwStatusCode == HTTP_STATUS_OK)
{
CString content;
CString data;
while (pfile -> ReadString(data))
{
content += data + "\r\n";
}
content.TrimRight();
printf(" %s\n " ,(LPCTSTR)content);
}
pfile
-> Close();
delete
pfile;
session.Close();
return 0 ;
}
Windows下用socket:
#include "winsock2.h"
#include <time.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#pragma comment(lib, "ws2_32.lib")
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576
void main()
{
WSADATA
wsaData;
int
err;
err =
WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0
)
{
return;
}
// timer
is start
clock_t
start, finish;
double
duration;
start =
clock();
char
host[] = "www.sina.com.cn";
char
*request = "GET / HTTP/1.0\r\nHost: www.sina.com.cn\r\nConnection:
Close\r\n\r\n";
struct
hostent *hp;
hp =
gethostbyname(host);
if(hp ==
NULL)
{
cout << "gethostbyname() error in
GetIpByHost: " << host
<< endl;
return;
}
//
获取域名对应的IP
struct
in_addr inAddr;
LPSTR
lpAddr;
lpAddr =
hp->h_addr;
memmove(&inAddr,lpAddr,4);
int sock,
ret = 0, optval = 1;
struct
sockaddr_in sa;
sa.sin_family =
AF_INET;
sa.sin_port
= htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));
sock =
socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock ==
-1)
{
return;
}
if(sock ==
-2)
{
return;
}
// send
the "GET" data
ret =
send(sock, request, strlen(request), 0);
//
网页内容长度。可以从http头部数据中获取 "Content-Length:"
int
m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
char
*pageBuf;
pageBuf =
(char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);
int
bytesRead = 0;
while(ret
> 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead,
0);
if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0';
cout
<< bytesRead
<< endl;
// write
the html content to the file
ofstream
ofs;
ofs.open("ofs.txt");
ofs
<< pageBuf
<< endl;
ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();
// timer
is finish
finish =
clock();
duration =
(double)(finish - start) / CLOCKS_PER_SEC;
cout
<< "have cost "
<< duration
<< " seconds\n";
return;
}
其他如不从缓存中读取内容及如何使用代理连接现在就不说了,可以参考下面的链接,或者下次补上。另外不妨看看 Java(使用 Apache Commons HttpClient)是如何读取 URL 内容的,更简单:
// Fetch a URL with Apache Commons HttpClient and print the response body
// when the server answers 200 OK.
GetMethod httpMethod = new GetMethod("http://unmi.blogcn.com");
try
{
    int statusCode = new HttpClient().executeMethod(httpMethod);
    if (statusCode == HttpStatus.SC_OK)
    {
        System.out.println(httpMethod.getResponseBodyAsString());
    }
}
finally
{
    // Release the connection even if executeMethod or the body read throws.
    httpMethod.releaseConnection();
}
// Fetch a URL with Apache Commons HttpClient and print the response body
// when the server answers 200 OK.
GetMethod httpMethod = new GetMethod("http://unmi.blogcn.com");
try
{
    int statusCode = new HttpClient().executeMethod(httpMethod);
    if (statusCode == HttpStatus.SC_OK)
    {
        System.out.println(httpMethod.getResponseBodyAsString());
    }
}
finally
{
    // Release the connection even if executeMethod or the body read throws.
    httpMethod.releaseConnection();
}
内容取过来之后,总是希望从中拣出需要的数据,可惜 VC6 中没有自己的正则表达式库,所以下一步要学用 boost 的正则表达式库。
Linux 下最简单,三种办法:
如果在C程序想抓取网页内容,比如百度
方法1. 执行 system("wget http://www.baidu.com -q -O baidu.html"),然后再分析 baidu.html。
方法2. system("curl http://www.baidu.com")
方法3. 直接写socket程序获取http://www.baidu.com的内容存入buffer中。
参考链接: 1. VC++6.0 通过HTTP方式获取网页
2. CInternetSession获取网页内容的问题
3. 利用CInternetSession从网站获取信息,并利用CString拆分查找特定信息
加载中,请稍候......