数据挖掘——决策树
否存在足够的样本。
(2)ID3算法只能处理分类属性(离散属性),而不能处理连续属性(数值属性)。在处理连续属性时,一般要先将连续属性划分为多个区间,转化为分类属性。例如“年龄”,要把其数值事先转换为诸如“小于30岁”、“30~50岁”、“大于50岁”这样的区间,再根据年龄值落入了某一个区间取相应的类别值。通常,区间端点的选取包含着一定的主观因素和大量的计算。
(3)ID3算法生成的决策树是一棵多叉树,分支的数量取决于分裂属性有多少个不同的取值。这不利于处理分裂属性取值数目较多的情况。因此目前流行的决策树算法大多采用二叉树模型。
(4)ID3算法不包含对树的修剪,无法对决策树进行优化。
5. 信息熵
自信息量只能反映符号的不确定性,而信息熵可以用来度量整个信源X整体的不确定性。设某事物具有n种相互独立的可能结果(或称状态): ,每一种结果出现的概率分别为 且有: ,那么,该事物所具有的不确定量为:
?X??p?x1?I?x1??p(x2)I(x2)...???p?xi?I?xi? H i?1np和n分别为正面集合反面集(类标号属性各个值的集合) 一棵决策树能对一个例子做出正确类别判断所需的信息量如下公式所示:
ppnnlog2?log2 I(p,n)??p?np?np?np?n
vpi?ni如果以属性A为分裂属性,A具有v个值,则A的期望为:E (A)?I(pi,ni)i?1p?n
因此,以A为根的信息增益如下公式所示: gain(A)?I(p,n)?E(A)?
六、结论
数据库和数据仓库技术的迅猛发展,使得存储的信息量日益增加,数据挖掘技术正是在此基础上应运而生的,而且数据挖掘技术也越来越受到国内外许多学者的广泛关注。本文首先简单介绍了数据挖掘的现状以及其基本过程,然后介绍了数据挖掘中所用的基本方法,接着详细介绍了ID3。
决策树算法是基于信息熵理论的有效的算法。它运用了交叉确认的模型验证方法。本文通过对数据的分析,利用决策树理论以及相关性分析,对初选的数据进行筛选,得到比较科学合理的数据挖掘结构,然后再对决策树进行构建和修剪,得到分类规则,进而得出综合评价。决策树算法对数据分布无任何要求, 应用于银行和金融的效果也比较好, 因此具有良好的发展前景, 值得我们深入研究。
七、决策树代码
代码: 创建类型:
CREATE TYPE CNode AS OBJECT( nview VARCHAR(255),
9
-- Node view name
数据挖掘——决策树
rule VARCHAR(30), -- Node rule entrop NUMBER(10,8), -- Node entropy pop NUMBER(5)); -- Node population /
-- List of Nodes in Candidate list
CREATE TYPE CNodeList AS TABLE OF CNode; /
-- Candidate list (relational table)
CREATE TABLE BTCandidate( att_name VARCHAR(30) PRIMARY KEY, -- Attribute name gain NUMBER(10,8), -- Information gain nodes CNodeList) -- List of associated nodes NESTED TABLE nodes STORE AS BTNode; 创建过程:
CREATE TYPE CNode AS OBJECT( nview VARCHAR(255), -- Node view name rule VARCHAR(30), -- Node rule entrop NUMBER(10,8), -- Node entropy pop NUMBER(5)); -- Node population /
-- List of Nodes in Candidate list
CREATE TYPE CNodeList AS TABLE OF CNode; /
-- Candidate list (relational table)
CREATE TABLE BTCandidate( att_name VARCHAR(30) PRIMARY KEY, -- Attribute name gain NUMBER(10,8), -- Information gain nodes CNodeList) -- List of associated nodes NESTED TABLE nodes STORE AS BTNode;
-- Input parameters: table_name: source table name -- class: class attribute name -- res_name: result table name
-- min_gain: (strict) minimum information gain for node building -- root_view: root node view name -- del: clean-up views after execution (True/False) --
CREATE OR REPLACE PROCEDURE BuildTree( table_name VARCHAR,
10
数据挖掘——决策树
class VARCHAR, res_name VARCHAR DEFAULT 'BTRES',
min_gain REAL DEFAULT 0, root_view VARCHAR DEFAULT 'BTROOT', del BOOLEAN DEFAULT TRUE) AUTHID CURRENT_USER AS --
-- Data types declaration -- -- Node TYPE Node IS RECORD( num INTEGER, -- Node number nview VARCHAR(255), -- Node view name rule VARCHAR(30), -- Node rule entrop REAL, -- Node entropy pop INTEGER); -- Node population strength
TYPE NodeList IS TABLE OF Node; -- List of nodes
TYPE StringList IS TABLE OF VARCHAR(255); -- List of strings
TYPE DynCursor IS REF CURSOR; -- Dynamic cursor --
-- Variables declaration --
stack NodeList := NodeList(); -- Stack of nodes (empty) cnode Node; -- Current node att_names DynCursor; -- List of attribute names catt VARCHAR(30); -- Current attribute name att_values DynCursor; -- List of current attribute values cval VARCHAR(30); -- Current value egain REAL; -- Information gain nnode Node; -- New node viewq VARCHAR(1000); -- View creation query att_names2 DynCursor; -- List of attribute names att_name VARCHAR(30); -- Attribute name candq VARCHAR(1000); -- Candidate insertion query maxg REAL; -- Maximum information gain max_var VARCHAR(30); -- Variable associated to maximum information gain candidates DynCursor; -- Candidate list cvar VARCHAR(30); -- Attribute in candidate list elist CNodeList; -- List of nodes in candidate list
delist StringList := StringList(); -- List of nodes whose views must be deleted (empty)
11
数据挖掘——决策树
resq VARCHAR(1000); -- Result query
classval StringList := StringList(); -- Class attribute values list nodenum INTEGER := 0; -- Number of nodes in tree
i BINARY_INTEGER; j BINARY_INTEGER; --
-- Entropy and population computation procedure --
PROCEDURE Entropy(view_name IN VARCHAR, -- Name of view associated to the treated node ent OUT REAL, -- Entropy value
totpop OUT INTEGER) -- Population total IS
TYPE PopStrList IS TABLE OF INTEGER; -- Type table of population strengths popstr PopStrList := PopStrList(); -- Population strengths table cstr INTEGER; -- Current population strength curs DynCursor; -- Cursor for counting population i BINARY_INTEGER; -- Loop index
BEGIN
-- Init. ent := 0; totpop := 0;
-- Count population strengths and store them in popstr -- Total population strength is computed on the fly OPEN curs FOR
'SELECT COUNT(*) FROM ' || view_name || ' GROUP BY ' || class; LOOP
FETCH curs INTO cstr;
EXIT WHEN curs%NOTFOUND; popstr.EXTEND;
popstr(popstr.COUNT) := cstr; totpop := totpop + cstr; /* Debug
DBMS_OUTPUT.PUT_LINE('popstr('||TO_CHAR(popstr.COUNT)||')='||TO_CHAR(popstr(popstr.COUNT)));
DBMS_OUTPUT.PUT_LINE('totpop='||TO_CHAR(totpop)); */
12
数据挖掘——决策树
END LOOP; CLOSE curs;
-- Compute entropy
FOR i IN 1..popstr.COUNT LOOP
ent := ent - (popstr(i)/totpop)*LOG(2,(popstr(i)/totpop)); END LOOP;
ent := ROUND(ent,8); -- Precision adaptation /* Debug
DBMS_OUTPUT.PUT_LINE('Entropy = '||TO_CHAR(ent));
DBMS_OUTPUT.PUT_LINE('Population = '||TO_CHAR(totpop)); */
popstr.TRIM(popstr.COUNT); -- Clean table
END; --
-- Result output procedure --
PROCEDURE Result( num INTEGER, -- Node number parent VARCHAR, -- Parent node number or NULL rule VARCHAR, -- Building rule view_name VARCHAR) -- Node associated view IS
curs DynCursor; -- Cursor for counting population resq VARCHAR(255); -- Result query v VARCHAR(30); -- Class attribute value p INTEGER; -- Associated population i BINARY_INTEGER; j BINARY_INTEGER;
BEGIN
-- Query init.
resq := 'INSERT INTO '||res_name||' VALUES('||TO_CHAR(num)||', '||parent||', '; IF rule <> 'NULL' THEN resq := resq || '''' || rule || ''''; ELSE
resq := resq || 'NULL'; END IF;
13