加载中…
个人资料
hhshnsq
hhshnsq
  • 博客等级:
  • 博客积分:0
  • 博客访问:3,215
  • 关注人气:187
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
正文 字体大小:

SAS编程技巧 -- 宏循环 PK 数据步循环

(2013-01-08 21:18:51)
标签:

杂谈

分类: SAS_Quick_Tips

初学 SAS 的人,有时难免混淆宏循环和数据步循环。今天看到一段需要做性能优化的代码,我发现其中最低效的部分就是没有正确使用宏循环——或许是因为作者对宏语言还不够熟悉,最终放弃了宏循环而改用数据步循环。

 

为了说明两种循环的区别,我们先看一下两种方法最后生成的 SAS 代码以及运行时间。

 

1)数据步循环

/* Data step generated by %loop_org for rule r1 (MPRINT output, re-indented).
   Every "if <n> = <m>" test below compares two constants left behind by
   macro substitution; they are ordinary data step statements, so they stay
   in the step that runs for each observation of rule_data.
   String literals that the log line-wrap had split across two lines
   (the ' sequence = ' pieces) are rejoined here so the code can be pasted
   and run as shown. */
data temp;
   retain rule_statement ;
   length rule_statement $5000 ;
   set rule_data(where = (rule_id = "r1"));

   /* ---- condition terms: loop index 2..3, rc_count = 3 ---- */
   if 2 = 2 then do;
      rule_statement = 'if '||"rc_term1 = "||quote(trim(rc_term1));
      if 3 = 2 then do;
         rule_statement = trim(left(rule_statement))||' then do ; ';
      end;
   end;
   else if 2 = 3 then do;
      rule_statement = trim(left(rule_statement))||" and rc_term1 = "||quote(trim(rc_term1))||' then do ; ';
   end;
   else do;
      rule_statement = trim(left(rule_statement))||" and rc_term1 = "||quote(trim(rc_term1));
   end;

   if 3 = 2 then do;
      rule_statement = 'if '||"rc_term2 = "||quote(trim(rc_term2));
      if 3 = 2 then do;
         rule_statement = trim(left(rule_statement))||' then do ; ';
      end;
   end;
   else if 3 = 3 then do;
      rule_statement = trim(left(rule_statement))||" and rc_term2 = "||quote(trim(rc_term2))||' then do ; ';
   end;
   else do;
      rule_statement = trim(left(rule_statement))||" and rc_term2 = "||quote(trim(rc_term2));
   end;

   /* ---- action terms: loop index 2..4, ra_count = 4 ---- */
   if 2 = 2 then do;
      rule_statement = trim(left(rule_statement))||"_ra_term1 = "||quote(trim(ra_term1))||';';
   end;
   else if 2 = 4 then do;
      rule_statement = trim(left(rule_statement))||"  _ra_term1 = "||quote(trim(ra_term1))||';'
                       ||' sequence = '||left(sequence)||'; output rule_result; end;';
      call symput ('rule_statement'||left(put(_n_,15.)), rule_statement);
      call symput("rule_statement_count",_n_);
   end;
   else do;
      rule_statement = trim(left(rule_statement))||" _ra_term1 = "||quote(trim(ra_term1))||';';
   end;

   if 3 = 2 then do;
      rule_statement = trim(left(rule_statement))||"_ra_term2 = "||quote(trim(ra_term2))||';';
   end;
   else if 3 = 4 then do;
      rule_statement = trim(left(rule_statement))||"  _ra_term2 = "||quote(trim(ra_term2))||';'
                       ||' sequence = '||left(sequence)||'; output rule_result; end;';
      call symput ('rule_statement'||left(put(_n_,15.)), rule_statement);
      call symput("rule_statement_count",_n_);
   end;
   else do;
      rule_statement = trim(left(rule_statement))||" _ra_term2 = "||quote(trim(ra_term2))||';';
   end;

   if 4 = 2 then do;
      rule_statement = trim(left(rule_statement))||"_ra_term3 = "||quote(trim(ra_term3))||';';
   end;
   else if 4 = 4 then do;
      /* last action term: close the generated rule and publish it as a
         macro variable named rule_statement<_n_> */
      rule_statement = trim(left(rule_statement))||"  _ra_term3 = "||quote(trim(ra_term3))||';'
                       ||' sequence = '||left(sequence)||'; output rule_result; end;';
      call symput ('rule_statement'||left(put(_n_,15.)), rule_statement);
      call symput("rule_statement_count",_n_);
   end;
   else do;
      rule_statement = trim(left(rule_statement))||" _ra_term3 = "||quote(trim(ra_term3))||';';
   end;
run;

 

NOTE: There were 1500 observations read from the data set WORK.RULE_DATA.
      WHERE rule_id='r1';
NOTE: The data set WORK.TEMP has 1500 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           2.63 seconds
      cpu time            2.54 seconds

 

2)宏循环

/* Data step generated by %loop for rule r1 (MPRINT output).
   All branching was resolved by the macro processor, so only two
   assignments and two CALL SYMPUTX statements remain.
   The log line-wrap had split the literals ' then do;' and 'ra_term3='
   across lines; they are rejoined here so the code can be pasted and run
   as shown. */
data temp;
   retain rule_statement ;
   length rule_statement $5000 ;
   set rule_data(where = (rule_id = "r1")) end=eof;

   /* condition part of the generated rule */
   rule_statement='if rc_term1='||quote(trim(rc_term1))||' and '||'rc_term2='||quote(trim(rc_term2))||' then do;';

   /* action part, followed by the closing sequence/output/end text */
   rule_statement=trim(rule_statement)||'ra_term1='||quote(trim(ra_term1))||';'||'ra_term2='||quote(trim(ra_term2))||';'
                  ||'ra_term3='||quote(trim(ra_term3))||';'||' sequence='||strip(sequence)||';output rule_result;end;';

   /* one macro variable per observation: rule_statement<_n_> */
   call symputx('rule_statement'||left(_n_), rule_statement);

   /* the count is written once, on the last observation */
   if eof then call symputx('rule_statement_count',_n_);
run;


NOTE: There were 1500 observations read from the data set WORK.RULE_DATA.
      WHERE rule_id='r1';
NOTE: The data set WORK.TEMP has 1500 observations and 9 variables.
NOTE: DATA statement used (Total process time):
      real time           0.34 seconds
      cpu time            0.28 seconds

 

 

我是在一台虚拟机上跑的,机器本身性能一般,从测试结果看性能相差近 8 倍,这里我们给定的 Rule 只有两条,如果每一条 Rule 执行时都差 2 秒多,那么当你的原始数据量很大,Rule 又很多很复杂时,性能差异将感觉非常明显。对于企业级用户而言,有着上千条 Rule 很正常,所以我们一定不能用数据步循环的方法来实现。另外,同事在另一台机器上测试的结果显示宏循环用 0.2 秒,数据步循环用 5 秒,相差 25 倍。

 

下面是全部代码(主要是生成 Rule 的条件判断和赋值语句),共有三个宏:

1)testdata

生成测试数据,测试数据的行数通过宏变量 rule_data_count 来控制。

/* number of test rows %testdata writes for each rule */
%let rule_data_count=1500;

 

2)loop

采用宏循环,因为要使用 SAS 的宏引用来处理单引号、括号等特殊符号,所以宏变量赋值部分的代码比较复杂。但是,宏替换完成后生成的数据步代码则很简洁,逻辑清晰。有些时候,性能和代码的简洁程度是成反比的,需要权衡利弊之后再做选择。如果数据量不大,则宁肯代码简洁便于维护,也不写很复杂很有技巧的代码。

 

3) loop_org

采用数据步循环,代码简单易懂,但是宏替换完后生成的数据步中有很多不必要的 if...else 语句,使得代码变得非常低效而且业务逻辑不清晰。这些 if...else 比较的都是宏替换后的常量(例如 if 3 = 2),结果在宏执行阶段就已确定,本可以用 %if 在生成代码时去掉,却被留在了数据步里。

 

/* ------------------------------------------------------------------ */
/* Test configuration shared by %testdata, %loop and %loop_org.       */
/* ------------------------------------------------------------------ */

/* size of the test: rows written per rule, and number of rules */
%let rule_count=2;
%let rule_data_count=1500;

/* rule ids, looked up as rule<n> and stored in the rule_id column */
%let rule1=r1;
%let rule2=r2;

/* Rule definitions. Each value is a colon-delimited list: the first item
   is a label, every following item is the NAME of an mvar_* macro
   variable (the macros start scanning at item 2). */
%let rulecondition1=rule1condition:mvar_rc_term1:mvar_rc_term2;
%let ruleaction1=rule1action:mvar_ra_term1:mvar_ra_term2:mvar_ra_term3;
%let rulecondition2=rule2condition:mvar_rc_term1:mvar_rc_term3;
%let ruleaction2=rule2action:mvar_ra_term1;

/* mvar_<term> -> name of the data set variable used for that term */
%let mvar_rc_term1=rc_term1;
%let mvar_ra_term1=ra_term1;
%let mvar_rc_term2=rc_term2;
%let mvar_ra_term2=ra_term2;
%let mvar_rc_term3=rc_term3;
%let mvar_ra_term3=ra_term3;

/* <term> -> test value %testdata assigns to the variable of that name */
%let rc_term1=rc_term_value1;
%let ra_term1=ra_term_value1;
%let rc_term2=rc_term_value2;
%let ra_term2=ra_term_value2;
%let rc_term3=rc_term_value3;
%let ra_term3=ra_term_value3;


%macro testdata;
/* Build WORK.RULE_DATA, the test input for %loop and %loop_org.
   For each rule 1..&rule_count the generated data step
     - sets rule_id to the value of rule<n>,
     - assigns every condition/action variable named in rulecondition<n>
       and ruleaction<n> its test value,
     - writes &rule_data_count rows with a running SEQUENCE number that
       continues across rules.
   Reads the global macro variables rule_count, rule_data_count, rule<n>,
   rulecondition<n>, ruleaction<n>, the mvar_* names and their values. */
%let sequence=0;
data rule_data;
   length rule_id $ 2;
   %do rule_no=1 %to &rule_count;
      rule_id="&&rule&rule_no";

      /* turn the colon-delimited list into a blank-delimited one and
         count its items (number of blanks + 1) */
      %let rc_term = %sysfunc(translate(&&rulecondition&rule_no,%str( ),%str(:)));
      %let rc_count = %eval(%sysfunc(count(&rc_term,%str( )))+1);
      %let ra_term = %sysfunc(translate(&&ruleaction&rule_no,%str( ),%str(:)));
      %let ra_count = %eval(%sysfunc(count(&ra_term,%str( )))+1);

      /* item 1 is the label, so terms start at item 2.
         &&&this_member resolves to the variable name and
         &&&&&&&this_member to its test value. */
      %do loop2i=2 %to &rc_count ;
         %let this_member = %scan(&rc_term,&loop2i);
         &&&this_member = "&&&&&&&this_member";
      %end;

      %do loop2i=2 %to &ra_count ;
         %let this_member = %scan(&ra_term,&loop2i);
         &&&this_member = "&&&&&&&this_member";
      %end;

      /* Write the rows with one data step DO loop per rule; unrolling
         this in the macro would generate a separate
         "sequence=<n>; output;" pair for every single row. */
      %let first_seq=%eval(&sequence+1);
      %let sequence=%eval(&sequence+&rule_data_count);
      do sequence=&first_seq to &sequence;
         output;
      end;
   %end;
run;
%mend;


%macro loop_org;
/* Original ("data step loop") version, kept as the slow baseline.
   For each rule 1..&rule_count it runs a data step over the rows of
   rule_data with that rule_id and builds the text of an IF ... THEN DO
   statement in RULE_STATEMENT, storing it in the macro variable
   rule_statement<_n_> via CALL SYMPUT.
   The macro %DO loops only replicate code: the decisions about first /
   last term are written as data step IF statements comparing constants
   (for example "if 3 = 2"), so they remain in the generated step.

   NOTE(review): in the action loop the "if &j = 2" branch wins when
   ra_count is 2 (a rule with a single action term, such as ruleaction2),
   so the "else if &j = &ra_count" branch containing CALL SYMPUT is never
   reached for such a rule. Left unchanged so the generated code matches
   the listing and timing shown in the article. */
%do rule_no = 1 %to &rule_count;
   /* colon-delimited list -> blank-delimited list, items = blanks + 1 */
   %let rc_term = %sysfunc(translate(&&rulecondition&rule_no,%str( ),%str(:)));
   %let rc_count = %eval(%sysfunc(count(&rc_term,%str( )))+1);
   %let ra_term = %sysfunc(translate(&&ruleaction&rule_no,%str( ),%str(:)));
   %let ra_count = %eval(%sysfunc(count(&ra_term,%str( )))+1);

   data temp;
      retain rule_statement ;
      length rule_statement $5000 ;
      set rule_data(where = (rule_id = "&&rule&rule_no"));

      /* condition terms (item 1 of the list is only a label) */
      %do i=2 %to &rc_count ;
         %let this_term = %scan(&rc_term,&i);
         if &i = 2 then do;
            rule_statement = 'if '||"&&&this_term = "||quote(trim(&&&&&this_term));
            if &rc_count = 2 then do;
               rule_statement = trim(left(rule_statement))||' then do ; ';
            end;
         end;
         else if &i = &rc_count then do;
            rule_statement = trim(left(rule_statement))||" and &&&this_term = "||quote(trim(&&&&&this_term))||' then do ; ';
         end;
         else do;
            rule_statement = trim(left(rule_statement))||" and &&&this_term = "||quote(trim(&&&&&this_term));
         end;
      %end;

      /* action terms; the last one also closes the rule and publishes it */
      %do j=2 %to &ra_count;
         %let this_term = %scan(&ra_term,&j);
         if &j = 2 then do;
            rule_statement = trim(left(rule_statement))||"_&&&this_term = "||quote(trim(&&&&&this_term))||';';
         end;
         else if &j = &ra_count then do;
            rule_statement = trim(left(rule_statement))||"  _&&&this_term = "||quote(trim(&&&&&this_term))||';'
                                         ||' sequence = '||left(sequence)||'; output rule_result; end;';
            call symput ('rule_statement'||left(put(_n_,15.)), rule_statement);
            call symput("rule_statement_count",_n_);
         end;
         else do;
            rule_statement = trim(left(rule_statement))||" _&&&this_term = "||quote(trim(&&&&&this_term))||';';
         end;
      %end;
   run;
%end;
%mend;

 

%macro loop;
/* Macro-loop version. For each rule 1..&rule_count the two assignment
   statements that build RULE_STATEMENT are assembled as macro text
   first (if_statement, action_statement); the data step then contains
   only those two assignments plus the CALL SYMPUTX statements.
   The pieces are built with %NRSTR so that quotes (%'), parentheses
   (%( and %)) and || stay plain text while being concatenated; %UNQUOTE
   turns the finished text back into code inside the data step. */
%do rule_no = 1 %to &rule_count;
   /* colon-delimited list -> blank-delimited list, items = blanks + 1 */
   %let rc_term = %sysfunc(translate(&&rulecondition&rule_no,%str( ),%str(:)));
   %let rc_count = %eval(%sysfunc(count(&rc_term,%str( )))+1);
   %let ra_term = %sysfunc(translate(&&ruleaction&rule_no,%str( ),%str(:)));
   %let ra_count    = %eval(%sysfunc(count(&ra_term,%str( )))+1);

   /* ---- condition part: rule_statement='if <var>='||quote(trim(<var>)) ... ---- */
   %let if_statement = %nrstr(rule_statement=%'if );
   %let delimiter=;
   %do i=2 %to &rc_count ;
      %let this_term = %scan(&rc_term,&i);

      /* from the second term on, join the terms with ' and ' */
      %if &i > 2 %then %let delimiter=%nrstr(||%' and %')%nrstr(||%');
      %let if_statement=&if_statement.&delimiter.&&&&&this_term.%nrstr(=%'||quote%(trim%()&&&&&this_term%nrstr(%)%));

      /* the last term closes the condition */
      %if &i = &rc_count %then %let if_statement=&if_statement.%nrstr(||%' then do;%');
   %end;
   %put &if_statement;

   /* ---- action part: appended to the trimmed condition text ---- */
   %let action_statement = %nrstr(rule_statement=trim%(rule_statement%)||%');
   %let delimiter=;
   %do i=2 %to &ra_count ;
      %let this_term = %scan(&ra_term,&i);

      %if &i > 2 %then %let delimiter=%nrstr(%'||%');
      %let action_statement=&action_statement.&delimiter.&&&&&this_term.%nrstr(=%'||quote%(trim%()&&&&&this_term%nrstr(%)%)||%';);

      /* the last term adds the sequence assignment, output and end */
      %if &i = &ra_count %then %let action_statement=&action_statement.%nrstr(%'||%' sequence=%'||strip%(sequence%)||%';output rule_result;end;%');
   %end;
   %put &action_statement;

   data temp;
      retain rule_statement ;
      length rule_statement $5000 ;
      set rule_data(where = (rule_id = "&&rule&rule_no")) end=eof;
      %unquote(&if_statement);
      %unquote(&action_statement);
      /* one macro variable per observation: rule_statement<_n_> */
      call symputx('rule_statement'||left(_n_), rule_statement);
      /* the count is written once, on the last observation */
      if eof then call symputx('rule_statement_count',_n_);
   run;
%end;
%mend;


/* Show the macro-generated code in the log, build the test data,
   then run the macro-loop version followed by the baseline version. */
options mprint;
%testdata
%loop
%loop_org

0

阅读 收藏 喜欢 打印举报/Report
  

新浪BLOG意见反馈留言板 欢迎批评指正

新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 产品答疑

新浪公司 版权所有