|
楼主

楼主 |
发表于 2013-8-5 07:45:42
|
只看该作者
[转] SAS决策树:二维空间的决策边界显示
在《数据挖掘导论》中看到斜决策树的介绍,如图-1所示,数据中存在着线性不等式的模式:
当 x1+x2<1 时,取值为0;否则,取值为1。
[code:27ws7y78]/* Simulate 100 (x1, x2) pairs with RANUNI and label each pair by the
   side of the line x1 + x2 = 1 on which it falls. */
data source;
    seed = 12345;
    do i = 1 to 100;
        x1 = ranuni(seed);
        x2 = ranuni(seed);
        /* A comparison evaluates to 1 when true and 0 when false,
           so target is 1 on or above the line and 0 below it. */
        target = (x1 + x2 >= 1);
        output;
    end;
run;
/* Scatter the simulated points, grouped by target, and overlay the
   reference line through (1, 0) with slope -1, i.e. x1 + x2 = 1. */
proc sgplot data=source;
    scatter y=x2 x=x1 / group=target;
    lineparm slope=-1 x=1 y=0;
run;
quit;[/code:27ws7y78]
接下来,我们使用 SAS EM 提供的 PROC ARBOR 生成一棵决策树。
[code:27ws7y78]/* Fit a decision tree for the binary target on the two interval inputs.
   SAVE writes the node statistics to nodstat1 and the split rules to rul1;
   SCORE writes the scored data to scoreout. */
proc arbor data=source
           criterion=ENTROPY
           splitsize=2
           maxbranch=2;
    target target / level=binary;
    input x1 x2 / level=interval;
    /* keep the largest subtree */
    subtree largest;
    save RULES=rul1 NODESTATS=nodstat1;
    score out=scoreout;
run;
quit; [/code:27ws7y78]
想要通过 SAS 编程的方式显示决策树并不容易,而且显示效果也一般。好在有热心的网友之前做过一些研究,这里我直接借用他的代码;由于运行时遇到了一点显示上的问题,我对他的源代码做了少量改动。
我借鉴的代码来自:http://www.sasanalysis.com/2011/02/visualize-decision-tree-by-coding-proc.html,作者是位资深的SAS用户,非常活跃,是分析方面的专家。
[code:27ws7y78]******** VISUALIZE DECISION TREE RESULTS************;
/* Build the activity/successor table that PROC NETDRAW consumes. */
proc sql;
    create table treedata as
        /* One row per parent -> child link; the text and target value
           on the row are those of the parent node. */
        select child.parent as act1,
               child.node,
               par.NODETEXT,
               par.U_Target
            from nodstat1 as child
                 inner join
                 nodstat1 as par
                 on child.parent = par.node
        union
        /* One row per node with a missing successor. */
        select solo.node as act1,
               . as node,
               solo.nodetext,
               solo.U_Target
            from nodstat1 as solo;
quit;
/* Add the fill-pattern index and tidy the node label text. */
data treedata1;
    set treedata;

    /* Pattern 1 for nodes whose U_Target is 1, pattern 2 for all others. */
    select;
        when (U_Target = 1) _pattern = 1;
        otherwise           _pattern = 2;
    end;

    /* Remove tab ('09'x) and space ('20'x) characters, then put a
       single space after every colon. */
    nodetext = tranwrd(compress(nodetext, '0920'x), ':', ': ');
run;
/* Draw the tree with PROC NETDRAW.
   pattern1 is green and pattern2 is solid red; treedata1 carries a
   _pattern value of 1 or 2 per row (presumably how NETDRAW picks the
   fill for each node -- confirm against the NETDRAW documentation).
   The footnote serves as the colour legend and is cleared afterwards. */
pattern1 color=green;
pattern2 value=solid color=red;
footnote color=green '1 ' color=red '0 ';
proc netdraw data=treedata1 graphics;
    actnet / activity=act1
             successor=NODE
             id=(NODETEXT)
             nolabel
             tree
             compress
             rotate
             rotatetext
             arrowhead=0
             font=simplex
             ctext=white
             htext=2;
run;
footnote ' ';[/code:27ws7y78]
最后,我们使用 SAS 提供的 Annotate 功能(这里是 PROC SGPLOT 的 SGANNO= 选项,即 SG Annotation)来画决策边界。这里的决策边界是从决策树的结果表中人工提取、再手工写入下面数据行中的,我没有花时间研究如何通过代码自动生成。
[code:27ws7y78]/* Keep only the PRIMARY-role rows of the saved rules whose stat is
   either VARIABLE or INTERVAL. */
data rules1;
    set rul1;
    where role = 'PRIMARY'
      and stat in ('VARIABLE', 'INTERVAL');
run;
/* One row per INTERVAL rule: node, name and numeric value. */
data decbounderies(keep=node name value);
    set rules1;

    /* LAG must execute on every row so that name holds the previous
       row's character_value (presumably the VARIABLE row that precedes
       each INTERVAL row -- confirm against the rul1 layout). */
    name  = lag(character_value);
    value = numeric_value;

    /* Subsetting IF: only INTERVAL rows reach the implicit output. */
    if stat = 'INTERVAL';
run;
/* Annotation data set: one line segment per data row, running from
   (x1, y1) to (x2, y2). The fields are '#'-delimited; label and
   textcolor are left blank and linecolor is blue on every row. */
data Lines;
    infile datalines dlm='#';
    length label $ 27 textcolor linecolor $ 9;
    /* All four coordinates are expressed in data-value space. */
    retain x1space y1space x2space y2space 'datavalue';
    input function $ x1 y1 label x2 y2 textcolor linecolor;
    datalines;
line # 0.4215868129 # 0 # # 0.4215868129 # 1 # # blue
line # 0 # 0.7397564595 # # 0.4215868129 # 0.7397564595 # # blue
line # 0.1081274502 # 0.7397564595 # # 0.1081274502 # 1 # # blue
line # 0.4215868129 # 0.2146758047 # # 1 # 0.2146758047 # # blue
line # 0.9196328702 # 0 # # 0.9196328702 # 0.2146758047 # # blue
line # 0.4215868129 # 0.4578804769 # #1 # 0.4578804769 # # blue
line # 0.6309236754 # 0.2146758047 # # 0.6309236754 # 0.4578804769 # # blue
;
run;
/* Redraw the grouped scatter plot with the segments in Lines applied
   as annotation. */
proc sgplot sganno=Lines data=source;
    scatter y=x2 x=x1 / group=target;
run;
quit; [/code:27ws7y78]
原帖:http://blog.sina.com.cn/s/blog_8db50cf70101i3vk.html |
|