加载中…
个人资料
  • 博客等级:
  • 博客积分:
  • 博客访问:
  • 关注人气:
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
正文 字体大小:

CURL并发--multi perform模式

(2013-04-27 20:31:55)
标签:

curl

multi_perform

分类: C/Cplusplus
// Data structures (private members of the crawler class; the class header is
// not part of this excerpt).
private:
    // Receive buffer for one concurrent request. fetch_zts() resets `len` to 0
    // and `data[0]` to 0 before starting, and treats `len <= 0` afterwards as
    // "no response received" for that request.
    // NOTE(review): presumably append_data() fills this buffer — confirm.
    typedef struct _sm_data_info_t
    {
        char data[20480];  // raw response bytes; fixed capacity of 20480
        int len;           // number of bytes stored in `data`
    } sm_data_info_t;

    // Per-request context. fetch_zts() passes its address to libcurl as
    // CURLOPT_WRITEDATA, so the write callback receives it as `userp`.
    typedef struct _sm_url_info_t
    {
        CURL* url;                  // easy handle created for this request
        int   index;                // slot number of this request (0-based)
        Weizhang_Crawl_t* crawler;  // owning crawler, set to `this`
    } sm_url_info_t;
    sm_url_info_t  _url_info[10];  // one context per concurrent request (max 10)
    sm_data_info_t _array[10];     // one receive buffer per concurrent request

// 函数定义
int Weizhang_Crawl_t::fetch_zts(const char *url, const char *referer, char post_str[][WZ_L_TEXT], int num, int timeout)
{
    struct timeval start_tv, end_tv;
    gettimeofday(&start_tv, NULL);
    int spend_time = 0;
    
    CURLM* multi_handle = curl_multi_init(); 
    int still_running = 0;

    _num = num;
    for (int i = 0; i < _num; i++)
    {
        _array[i].len = 0;
        _array[i].data[0] = 0;
    }
    // 初始化并发的每个url
    for (int i = 0; i < num; ++i)
    {
        CURL* curl = curl_easy_init();
        _url_info[i].url = curl;
        _url_info[i].crawler = this;
        _url_info[i].index = i;
        curl_easy_setopt(curl, CURLOPT_POSTFIELDS, post_str[i]);
        curl_easy_setopt(curl, CURLOPT_REFERER, referer);
        curl_easy_setopt(curl, CURLOPT_URL, url);
        // 定义一个写数据的函数
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, __write_http_data);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &(_url_info[i]));
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
        curl_easy_setopt(curl, CURLOPT_PROXY, _proxy);

        int ret = curl_multi_add_handle(multi_handle, curl);
        if (ret != CURLM_OK)
        {
            WRITE_LOG(LOG_WARNING, "failed to add handle.");
            return -1;
        }
    }
    //并发查询一次
    CURLMcode curlm_code = CURLM_CALL_MULTI_PERFORM;
    while(CURLM_CALL_MULTI_PERFORM == curlm_code) {
        curlm_code = curl_multi_perform(multi_handle, &still_running);
    }
    if (curlm_code != CURLM_OK) {
        WRITE_LOG(LOG_WARNING, "code[%d]msg[%s]", curlm_code, curl_multi_strerror(curlm_code));
        return -1;
    }
    // 采用select方式监听和接收数据,节约CPU资源,直到并发的所有数据接收成功
    struct timeval tv;
    tv.tv_sec = 4;  // 注意select时间,如果网站的速度比较慢,建议这个时间稍长,不然第一个select就timeout了,while循环就退出了,呵呵
    tv.tv_usec = 0;
    bool failed = false;
    while (still_running > 0 && !failed) 
    {
        int rc = -1;
        fd_set fdread;
        fd_set fdwrite;
        fd_set fdexcep;
        int maxfd = -1;
        FD_ZERO(&fdread);
        FD_ZERO(&fdwrite);
        FD_ZERO(&fdexcep);
        curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd); 
        rc = select(maxfd + 1, &fdread, &fdwrite, &fdexcep, &tv); 
        switch(rc) 
        {
            case -1:
                failed = true;
                break;
            case 0:
                failed = true;
                break;
            default:
                while (1)
                {
                    int ret = curl_multi_perform(multi_handle, &still_running);
                    //printf("ret[%d]PERFORM[%d]\n", ret, CURLM_CALL_MULTI_PERFORM);
                    if (ret != CURLM_CALL_MULTI_PERFORM)
                    {
                        break;
                    }
                }
                
                break;
        }
        // 如果有时间限制的话,可以设置超时,避免并发时间过长
        gettimeofday(&end_tv, NULL);
        spend_time = (end_tv.tv_sec-start_tv.tv_sec)*1000 + (end_tv.tv_usec-start_tv.tv_usec)/1000;
        if ((spend_time + 1000) >= timeout) {
            WRITE_LOG(LOG_WARNING, "multi perform timeout, spend_time:%dms", spend_time);
            return -1;
        }
   
    // 释放资源
    curl_multi_cleanup(multi_handle); 
    for (int i = 0; i < num; i++)
    {
        curl_easy_cleanup(_url_info[i].url);
    }

    if (failed) {
        WRITE_LOG(LOG_WARNING, "something failed. spend_time:%dms, [%m]", spend_time);
        return -1;
    }

    // 检查数据是否都接收成功
    for (int i = 0; i < num; i++)
    {
        if (_array[i].len <= 0)
        {
            WRITE_LOG(LOG_WARNING, "failed to get url of post_str[%s]index[%d]len[%d]", post_str[i], i, _array[i].len);
            return -1;
        }
    }
    return 0;
}

static int __write_http_data(char* data, size_t size, size_t nmemb, void* userp)
{
    int len = size * nmemb;
    if (NULL == data || NULL == userp)
    {
        WRITE_LOG(LOG_WARNING, "invalid status.");
        return 0;
    }

    Weizhang_Crawl_t* crawler = ((Weizhang_Crawl_t::sm_url_info_t *) userp)->crawler;
    int index = ((Weizhang_Crawl_t::sm_url_info_t *) userp)->index;
    int ret = crawler->append_data(index, data, len);
    if (ret != 0)
    {
        WRITE_LOG(LOG_WARNING, "failed to append data.");
        return 0;
    }
    return len;
}

0

阅读 收藏 喜欢 打印举报/Report
  

新浪BLOG意见反馈留言板 欢迎批评指正

新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 产品答疑

新浪公司 版权所有