见文本文档Nat-model-data.txt
附件3:A类和B类样品单个碱基丰度的计算和散点图的绘制程序 clear L
% Appendix 3 script: single-base abundance of the class A and class B samples
% and the corresponding scatter plots.
%
% Input : Art-model-data.txt, one sequence per line in the form "<n>.<sequence>"
%         where <n> is a one- or two-digit sample number.
% Output: L      - cell array, L{n} holds the sequence text of sample n
%         P      - N-by-4 matrix; row n holds the relative frequencies of the
%                  bases a, t, c, g (in that column order) of sample n
%         a_lei  - 2-by-4 matrix, rows = mean-3*std and mean+3*std over samples 1-10
%         b_lei  - 2-by-4 matrix, same bounds over samples 11-20
%         A 2x2 figure, one subplot per base.

L = {};                                   % start from an empty cell array
fid = fopen('Art-model-data.txt','r');
if fid == -1
    error('Cannot open Art-model-data.txt');
end

% Read the text file line by line and store each numbered sequence in L.
while true
    tline = fgetl(fid);
    if ~ischar(tline), break, end         % fgetl returns -1 at end of file
    if isempty(tline), continue, end      % skip blank lines

    if length(tline) >= 2 && tline(2) == '.'
        % one-digit sample number: "n.sequence"
        idx = str2double(tline(1));
        seq = tline(3:end);
    elseif length(tline) >= 3 && tline(3) == '.'
        % two-digit sample number: "nn.sequence"
        idx = str2double(tline(1:2));
        seq = tline(4:end);
    else
        continue                          % line does not carry a sample number
    end

    if isnan(idx) || idx < 1, continue, end   % prefix was not a valid sample number
    L{idx} = seq;
end
fclose(fid);

% Relative frequency of the four bases in every sequence.
bases = 'atcg';                           % column order of P
P = zeros(numel(L), 4);
for i = 1:numel(L)
    tline = L{i};
    for k = 1:4
        P(i,k) = sum(tline(:) == bases(k));   % count occurrences of this base
    end
    P(i,:) = P(i,:) ./ sum(P(i,:));       % normalise counts to frequencies
end

if size(P,1) < 20
    error('Expected at least 20 numbered sequences, found %d.', size(P,1));
end

% Samples 1-10 are class A, samples 11-20 are class B.
ave_a = mean(P(1:10,:));   sig_a = std(P(1:10,:));
ave_b = mean(P(11:20,:));  sig_b = std(P(11:20,:));

% Three-sigma bands: row 1 = lower bound, row 2 = upper bound.
a_lei = [ave_a - 3*sig_a; ave_a + 3*sig_a];
b_lei = [ave_b - 3*sig_b; ave_b + 3*sig_b];

% One subplot per base: class A as '*', class B as '.', with the class A band
% drawn in blue and the class B band drawn in green.
x_all  = 1:20;
flat20 = ones(1,20);
for k = 1:4
    subplot(2,2,k)
    plot(1:10, P(1:10,k), '*', 11:20, P(11:20,k), '.')
    hold on
    plot(x_all, a_lei(1,k)*flat20, 'b', x_all, a_lei(2,k)*flat20, 'b', ...
         x_all, b_lei(1,k)*flat20, 'g', x_all, b_lei(2,k)*flat20, 'g')
    xlabel(['碱基' bases(k)])
end
%max_ab=max([a_lei(:,[2 4]);b_lei(:,[2 4])]); %min_ab=min([a_lei(:,[2 4]);b_lei(:,[2 4])]);
%[row col]=find(P(:,[2 4])>ones(size(P(:,[2 4]),1),1)*max_ab | P(:,[2 4]) a1=find((P(:,2)>ones(size(P(:,2),1),1)*a_lei(1,2)) & (P(:,2) a2=find((P(:,4)>ones(size(P(:,4),1),1)*a_lei(1,4)) & (P(:,4) b1=find((P(:,2)>ones(size(P(:,2),1),1)*b_lei(1,2)) & (P(:,2) b2=find((P(:,2)>ones(size(P(:,2),1),1)*b_lei(1,2)) & (P(:,2) 附件4:A类和B类样品不同碱基丰度之比的计算和散点图的绘制程序 funcTion bili global P global Q Q=[]; for i=1:3 for j=i+1:4 Temp=P(:,j)./P(:,i); Q=[Q Temp]; end end for j=1:6 subploT(3,2,j) i1=1:10;i2=11:20; ploT(i1,Q(i1,j),'*r',i2,Q(i2,j),'+b') legend('A类','B类') xlabel(sprinTf('%d%',j)) ylabel('f(T)/f(A)') end 附件5:编号1-40样品的不同特征变量值统计表 表1 1-40号样品特征变量值统计表 碱基T碱基G碱基C编号 碱基T碱基G与碱基与碱基与碱基的丰度 的丰度 A的丰A的丰T的丰度之比 度之比 度之比 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 0.1351 0 .3964 0 .4545 1 .3333 0.1532 0 .4144 0 .5667 1 .5333 0.0631 0 .4505 0.2333 1 .6667 0.2883 0 .1802 0 .6809 0 .4255 0.1081 0 .4234 0 .4615 1 .8077 0.1261 0 .3964 0 .3590 1 .1282 0.1892 0 .3604 0 .5385 1 .0256 0.1892 0 .3694 0 .6774 1 .3226 0.1532 0.4324 0.1364 0 .4091 0.7391 2.0870 0.7500 2.2500 1.2667 1.0588 3.4286 0.3750 2.1667 1.0000 0.5238 0.8571 1.3529 2.0000 0.0909 0.0545 0.1930 0.1636 0 0.1765 0.9310 0.2364 0.2581 0.3065 0.5366 1.0870 1.3684 0.4681 1.0435 0.5455 0.8750 0.3269 1.1579 0.9630 0.5000 0.6944 碱基G与碱基T的丰度之比 2.9333 2.7059 7.1429 0.6250 3.9167 3.1429 1.9048 1.9524 2.8235 3.0000 0.2000 0.2909 0.2456 0.2364 0.0986 0.1961 0.5172 0.1818 0.1290 0.1129 0.4634 1.1304 2.0526 0.4681 1.3913 0.4773 1.4583 0.3462 2.3684 0.8519 0.6250 0.8056 0.5000 0 .1000 1 .4103 0 .2821 0.5000 0 .1455 1 .5278 0.4444 0.5182 0 .1273 2 .0357 0 .5000 0.5000 0 .1182 1 .6667 0 .3939 0.6455 0 .0636 2 .2188 0 .2188 0.4636 0 .0909 1 .2750 0 .2500 0.2636 0 .1364 0 .7436 0 .3846 0.5000 0 .0909 1 .7188 0 .3125 0.5636 0 .0727 2.5833 0 .3333 0.5636 0 .0636 2 .8182 0 .3182 0.3628 0 .1681 1 .3226 0 .6129 0.2212 0 .2500 0 .7667 0 .8667 0.1863 0 .3824 1 .0556 2 .1667 0.4087 0 .1913 1 .9583 0 .9167 0.2190 0 .3048 0 .8846 
1 .2308 0.3860 0 .1842 1 .7600 0 .8400 0.2308 0 .3365 1 .0000 1 .4583 0.4444 0 .1538 1 .7333 0 .6000 0.1881 0 .4455 1 .2667 3 .0000 0.2523 0 .2150 0 .8710 0 .7419 0.3571 0 .2232 1 .4815 0 .9259 0.3303 0 .2661 1 .8947 1 .5263 33 34 35 36 37 38 39 40 0.3333 0 .2072 1 .2333 0 .7667 0.1667 0 .3627 0 .7083 1 .5417 0.2039 0 .3398 0 .8400 1 .4000 0.2095 0 .2571 0 .9167 1 .1250 0.2039 0 .3301 0 .9545 1 .5455 0.4359 0 .1709 1 .9615 0 .7692 0.2358 0 .2075 0 .8621 0 .7586 0.4310 0 .1724 2 .1739 0 .8696 0.5676 1.4118 1.0476 1.4545 1.2381 0.3922 1.2000 0.4600 0.6216 2.1765 1.6667 1.2273 1.6190 0.3922 0.8800 0.4000 附件6:求解编号1-20样品两两之间Lance和Williams距离的程序 function julei_analyse global P global Q i=1:40; vec=[P(i,[2 4 ]),Q(i,[1 5 ])]; Len=zeros(20); for i=1:20 for j=1:20 temp=sum(abs(vec(i,:)-vec(j,:))./(vec(i,:)+vec(j,:))); Len(i,j)=temp; end end [II1 II2]=find(Len==max(max(Len))); c_type=[];d_type=[];c=[];d=[]; c_type=[c_type II1(1)];d_type=[d_type II1(2)]; kik=[];kik=setdiff(1:20,[II1(1) , II1(2)]); while ~isempty(kik) %A,B类内部之间的聚类分析 c=mean(vec(c_type,:)); d=mean(vec(d_type,:)); min_c=100000;min_d=100000; for i=kik tom=sum(abs(vec(i,:)-c)./(vec(i,:)+c)); if tom<=min_c; min_c=tom; j=i; end end c_type=[c_type j];kik=setdiff(kik,j); for i=kik tom=sum(abs(vec(i,:)-d)./(vec(i,:)+d)); if tom<=min_d; min_d=tom; j=i; end end d_type=[d_type j];kik=setdiff(kik,j); end c %B类数据向量平均值 d %A类数据向量平均值 for i=21:40 cic= sum(abs(vec(i,:)-c)./(vec(i,:)+c)); did=sum(abs(vec(i,:)-d)./(vec(i,:)+d)); if cic>did d_type=[d_type i]; else c_type=[c_type i]; end end c_type=sort(c_type); d_type=sort(d_type); a_mid=mean(vec(d_type,:))%人工序列四十组的均值 b_mid=mean(vec(c_type,:)) vec1=[P(:,[2 4]) Q(:,[1 5 ])]; Len(II1(1) , II1(2)) 附件7:用聚类分析延拓法求解分类结果的程序 clear L fid = fopen('Nat-model-data.txt','r'); while 1 %该循环将txt文件的数据存入 L中 tline = fgetl(fid); if ~ischar(tline), break, end if strcmp(tline,'') continue end if length(tline)>4 if strcmp(tline(2),':') i=str2num(tline(1)); L{i}=tline(4:end); elseif strcmp(tline(3),':') 
i=str2num(tline(1:2)); L{i}=tline(5:end); elseif strcmp(tline(4),':') i=str2num(tline(1:3)); L{i}=tline(6:end); end end if strcmp(tline(1),'a') | strcmp(tline(1),'t') | strcmp(tline(1),'c') | strcmp(tline(1),'g')