// Data structures (private members of Weizhang_Crawl_t)
private:
    typedef struct _sm_data_info_t
    {
        char data[20480];   // receive buffer for one request's response body
        int  len;           // number of bytes currently stored in data
    } sm_data_info_t;

    typedef struct _sm_url_info_t
    {
        CURL*             url;      // the easy handle for this request
        int               index;    // slot index into _array
        Weizhang_Crawl_t* crawler;  // back-pointer to the owning crawler
    } sm_url_info_t;

    sm_url_info_t  _url_info[10];   // per-request bookkeeping, up to 10 concurrent
    sm_data_info_t _array[10];      // per-request response buffers
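These members live inside the Weizhang_Crawl_t class. For orientation, here is a minimal sketch of the surrounding class declaration, reconstructed from how the code below uses it; the value of WZ_L_TEXT, the type of _proxy, and the friend declaration are assumptions, not the original source:

#include <curl/curl.h>

#define WZ_L_TEXT 1024   // assumed size of one POST body

static size_t __write_http_data(char* data, size_t size, size_t nmemb, void* userp);

class Weizhang_Crawl_t
{
public:
    int fetch_zts(const char* url, const char* referer,
                  char post_str[][WZ_L_TEXT], int num, int timeout);
    int append_data(int index, const char* data, int len);

private:
    // ... the sm_data_info_t / sm_url_info_t members shown above ...
    int  _num;          // number of requests in the current batch
    char _proxy[256];   // proxy address passed to CURLOPT_PROXY (type assumed)

    // the write callback needs access to the private sm_url_info_t type
    friend size_t __write_http_data(char*, size_t, size_t, void*);
};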
// Function definition
int Weizhang_Crawl_t::fetch_zts(const char *url, const char *referer,
                                char post_str[][WZ_L_TEXT], int num, int timeout)
{
    struct timeval start_tv, end_tv;
    gettimeofday(&start_tv, NULL);
    int spend_time = 0;
    CURLM* multi_handle = curl_multi_init();
    int still_running = 0;
    _num = num;
    for (int i = 0; i < _num; i++)
    {
        _array[i].len = 0;
        _array[i].data[0] = 0;
    }
    // Initialize an easy handle for each concurrent URL
    for (int i = 0; i < num; ++i)
    {
        CURL* curl = curl_easy_init();
        _url_info[i].url = curl;
        _url_info[i].crawler = this;
        _url_info[i].index = i;
        curl_easy_setopt(curl, CURLOPT_POSTFIELDS, post_str[i]);
        curl_easy_setopt(curl, CURLOPT_REFERER, referer);
        curl_easy_setopt(curl, CURLOPT_URL, url);
        // Register the callback that receives response data
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, __write_http_data);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &(_url_info[i]));
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_PROXY, _proxy);
        CURLMcode ret = curl_multi_add_handle(multi_handle, curl);
        if (ret != CURLM_OK)
        {
            WRITE_LOG(LOG_WARNING, "failed to add handle.");
            // release everything created so far instead of leaking it
            curl_easy_cleanup(curl);
            for (int j = 0; j < i; j++)
            {
                curl_multi_remove_handle(multi_handle, _url_info[j].url);
                curl_easy_cleanup(_url_info[j].url);
            }
            curl_multi_cleanup(multi_handle);
            return -1;
        }
    }
    // Drive all concurrent transfers once to get them started
    CURLMcode curlm_code = CURLM_CALL_MULTI_PERFORM;
    while (CURLM_CALL_MULTI_PERFORM == curlm_code) {
        curlm_code = curl_multi_perform(multi_handle, &still_running);
    }
    bool failed = false;
    if (curlm_code != CURLM_OK) {
        WRITE_LOG(LOG_WARNING, "code[%d]msg[%s]",
                  curlm_code, curl_multi_strerror(curlm_code));
        failed = true;   // skip the select loop and fall through to cleanup
    }
    // Use select() to wait for socket activity instead of busy-polling,
    // saving CPU, until every concurrent transfer has finished
    while (still_running > 0 && !failed)
    {
        fd_set fdread;
        fd_set fdwrite;
        fd_set fdexcep;
        int maxfd = -1;
        FD_ZERO(&fdread);
        FD_ZERO(&fdwrite);
        FD_ZERO(&fdexcep);
        curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd);
        // Note the select() timeout: if the target site is slow, make it
        // longer, otherwise the first select() times out and the while
        // loop exits. Reinitialize tv on every iteration, because Linux
        // select() modifies the timeval in place.
        struct timeval tv;
        tv.tv_sec = 4;
        tv.tv_usec = 0;
        int rc = select(maxfd + 1, &fdread, &fdwrite, &fdexcep, &tv);
        switch (rc)
        {
        case -1:   // select() error
            failed = true;
            break;
        case 0:    // select() timed out with no activity
            failed = true;
            break;
        default:   // sockets are ready: let libcurl make progress
            while (1)
            {
                int ret = curl_multi_perform(multi_handle, &still_running);
                //printf("ret[%d]PERFORM[%d]\n", ret, CURLM_CALL_MULTI_PERFORM);
                if (ret != CURLM_CALL_MULTI_PERFORM)
                {
                    break;
                }
            }
            break;
        }
        // Enforce the caller's overall deadline (with a ~1s margin) so a
        // batch cannot run arbitrarily long
        gettimeofday(&end_tv, NULL);
        spend_time = (end_tv.tv_sec - start_tv.tv_sec) * 1000 +
                     (end_tv.tv_usec - start_tv.tv_usec) / 1000;
        if ((spend_time + 1000) >= timeout) {
            WRITE_LOG(LOG_WARNING, "multi perform timeout, spend_time:%dms",
                      spend_time);
            failed = true;   // break out and release the handles below
            break;
        }
    }
    // Release resources: detach and free each easy handle, then free the
    // multi handle
    for (int i = 0; i < num; i++)
    {
        curl_multi_remove_handle(multi_handle, _url_info[i].url);
        curl_easy_cleanup(_url_info[i].url);
    }
    curl_multi_cleanup(multi_handle);
    if (failed) {
        WRITE_LOG(LOG_WARNING, "something failed. spend_time:%dms, [%m]",
                  spend_time);
        return -1;
    }
    // Check that every concurrent request actually received data
    for (int i = 0; i < num; i++)
    {
        if (_array[i].len <= 0)
        {
            WRITE_LOG(LOG_WARNING, "failed to get url of post_str[%s]index[%d]len[%d]",
                      post_str[i], i, _array[i].len);
            return -1;
        }
    }
    return 0;
}
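To make the call sequence concrete, here is a hypothetical caller. The endpoint, referer, and POST form fields are invented for illustration, and it assumes _proxy already points at a usable proxy (an empty string disables proxying in libcurl):

#include <curl/curl.h>
#include <cstdio>

int main()
{
    // libcurl requires one global init per process before creating handles
    curl_global_init(CURL_GLOBAL_ALL);

    Weizhang_Crawl_t crawler;
    char post_str[2][WZ_L_TEXT];
    snprintf(post_str[0], WZ_L_TEXT, "hphm=A12345&city=bj");   // made-up form fields
    snprintf(post_str[1], WZ_L_TEXT, "hphm=B67890&city=sh");

    // run both queries concurrently with a 10-second overall deadline
    int ret = crawler.fetch_zts("http://example.com/wz/query",
                                "http://example.com/wz/", post_str, 2, 10000);
    printf("fetch_zts returned %d\n", ret);

    curl_global_cleanup();
    return ret == 0 ? 0 : 1;
}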
// Write callback: libcurl expects it to return the number of bytes
// consumed as a size_t; returning anything less aborts the transfer.
// (As a file-static function it needs friend access to the private
// sm_url_info_t type.)
static size_t __write_http_data(char* data, size_t size, size_t nmemb, void* userp)
{
    size_t len = size * nmemb;
    if (NULL == data || NULL == userp)
    {
        WRITE_LOG(LOG_WARNING, "invalid status.");
        return 0;
    }
    Weizhang_Crawl_t::sm_url_info_t* info = (Weizhang_Crawl_t::sm_url_info_t *) userp;
    Weizhang_Crawl_t* crawler = info->crawler;
    int index = info->index;
    int ret = crawler->append_data(index, data, len);
    if (ret != 0)
    {
        WRITE_LOG(LOG_WARNING, "failed to append data.");
        return 0;   // short return tells libcurl to abort this transfer
    }
    return len;
}
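append_data is invoked by the callback above but its body is not shown in the post. A plausible sketch, given the fixed 20480-byte buffer in sm_data_info_t, bounds-checks and appends while keeping the buffer NUL-terminated; the real implementation may differ:

#include <string.h>   // memcpy

int Weizhang_Crawl_t::append_data(int index, const char* data, int len)
{
    if (index < 0 || index >= _num || len < 0)
    {
        return -1;
    }
    sm_data_info_t& slot = _array[index];
    // reject data that would overflow the buffer, keeping one byte free
    // for the terminating NUL
    if (slot.len + len >= (int) sizeof(slot.data))
    {
        WRITE_LOG(LOG_WARNING, "response too large, index[%d]", index);
        return -1;
    }
    memcpy(slot.data + slot.len, data, len);
    slot.len += len;
    slot.data[slot.len] = 0;   // keep NUL-terminated for later parsing
    return 0;
}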