L{i}=strcat(L{i},tline); end end
fclose(fid);
%统计每种碱基的频率
P1=[];
for i=1:size(L,2) %统计每种碱基个数特征 tline=L{i};
P1(i,1)=length(find(tline(:)=='a')); P1(i,2)=length(find(tline(:)=='t')); P1(i,3)=length(find(tline(:)=='c')); P1(i,4)=length(find(tline(:)=='g')); P1(i,:)=P1(i,:)./sum(P1(i,:)); end
Q1=[];%表示两两碱基之间的频率比
for i=1:2 %数据处理顺序主要考虑与人工处理相一致 for j=i+1:4
temp=P1(:,j)./P1(:,i); Q1=[Q1 temp]; end
end
temp=P1(:,3)./P1(:,4); Q1=[Q1 temp];
%以下计算自然序列与人工序列的平均值之间的距离,以进行筛选分类 aa=[ 0.1393 0.5311 1.5172 3.2803]; bb=[ 0.5020 1.8300 0.2618 0.2131]; vec=[P1(:,[2 ]),Q1(:,[1 4 5 ])];a_type=[];b_type=[]; for i=1:182
aia= sum(abs(vec(i,:)-aa)./(vec(i,:)+aa)); bib=sum(abs(vec(i,:)-bb)./(vec(i,:)+bb)); if aia b_type=[b_type i]; end end b_type=sort(b_type) a_type=sort(a_type) 附件8:用马尔可夫法对编号21-40人工DNA序列以及182个自然序列进行分 类的程序 (1) 对编号21-40人工DNA序列进行分类的程序 function marl_jianyan clear L N=20; fid = fopen('Art-model-data.txt','r'); i=1; while 1 %该循环将txt文件的数据存入 L中 tline = fgetl(fid); if ~ischar(tline), break, end if strcmp(tline,'') continue end if strcmp(tline(2),'.') L{str2num(tline(1))}=tline(3:end); elseif strcmp(tline(3),'.') L{str2num(tline(1:2))}=tline(4:end); end end fclose(fid);a_trans=zeros(4);b_trans=zeros(4); % 每条序列4种碱基的频率 for i=1:10 tline=L{i}; ka=find(tline(:)=='a'); kt=find(tline(:)=='t'); kc=find(tline(:)=='c'); kg=find(tline(:)=='g'); temp=[];temp(ka)=1;temp(kt)=2;temp(kc)=3;temp(kg)=4; for j=1:length(temp)-1 a_trans(temp(j),temp(j+1))= a_trans(temp(j),temp(j+1))+1; end end a_trans=a_trans./(sum(a_trans')'*ones(1,4)); %A类马尔科夫概率转移矩阵 for i=11:20 tline=L{i}; ka=find(tline(:)=='a'); kt=find(tline(:)=='t'); kc=find(tline(:)=='c'); kg=find(tline(:)=='g'); temp=[];temp(ka)=1;temp(kt)=2;temp(kc)=3;temp(kg)=4; for j=1:length(temp)-1 b_trans(temp(j),temp(j+1))= b_trans(temp(j),temp(j+1))+1; end end b_trans=b_trans./(sum(b_trans')'*ones(1,4)); %B类马尔科夫概率转移矩阵 a_trans; b_trans; a_type=[];b_type=[];PA=[];PB=[];ee_a=[];ee_b=[]; %A、B类内部检验 for i=1:N tline=L{i}; rubi=find(tline=='n'|tline=='s'|tline=='r'|tline=='w'|tline=='y'); %剔除异常碱基序列 if ~isempty(rubi) kk=setdiff(1:length(tline),rubi); tline=tline(kk); end ka=find(tline(:)=='a'); kt=find(tline(:)=='t'); kc=find(tline(:)=='c'); kg=find(tline(:)=='g'); temp=[];temp(ka)=1;temp(kt)=2;temp(kc)=3;temp(kg)=4; pa=1;pb=1;zhishu_a=0;zhishu_b=0; %将指数和系数分离 for j=1:length(temp)-1 pa=pa*a_trans(temp(j),temp(j+1)); pb=pb*b_trans(temp(j),temp(j+1)); if pa<0.1& pa>0.01 pa=pa*10; zhishu_a=zhishu_a+1; elseif pa<0.01 &pa>1e-3 pa=pa*100; zhishu_a=zhishu_a+2; end if pb<0.1 & pb>.01 pb=pb*10; zhishu_b=zhishu_b+1; elseif pb<.01 pb=pb*100; zhishu_b=zhishu_b+2; end pa=round(pa*100)/100;%保留两位有效数字 pb=round(pb*100)/100; end ee_a=[ee_a zhishu_a]; %记录数据 ee_b=[ee_b zhishu_b]; PA=[PA pa]; PB=[PB pb]; end ee_ab=ee_a-ee_b; det_ab=PA-PB.*10.^(ee_ab); %b_type =find(ee_a-ee_b>10); %a_type =find(ee_b-ee_a>10); aa=find(det_ab>0); bb=find(det_ab<0); log_a12=[];log_b12=[]; %以下为取对数的数据检验 for i=1:N log_a12=[log_a12,log10(PA(i))-ee_a(i)]; log_b12=[log_b12,log10(PB(i))-ee_b(i)]; end i1=[1:3,5:10];i2=[4,11:20]; mean_a1=mean(log_a12(i1)) sig_a1=std(log_a12(i1)) mean_a2=mean(log_a12(i2)) sig_a2=std(log_a12(i2)) r_a=(sig_a1+sig_a2)/abs(mean_a1-mean_a2) mean_b1=mean(log_b12(i1)) sig_b1=std(log_b12(i1)) mean_b2=mean(log_b12(i2)) sig_b2=std(log_b12(i2)) r_b=(sig_b1+sig_b2)/abs(mean_b1-mean_b2) %A_num=PA.*10.^(ee_a) %B_num=PB.*10.^(ee_b) %PA %PB ?_a ?_b log_a12 log_b12 better_r_a=(sig_a1+sig_b1)/abs(mean_a1-mean_b1) better_r_b=(sig_a2+sig_b2)/abs(mean_a2-mean_b2) (2) 对182个自然序列进行分类的程序 clear L fid = fopen('Nat-model-data.txt','r'); while 1 %该循环将txt文件的数据存入 L中 tline = fgetl(fid); if ~ischar(tline), break, end if strcmp(tline,'') continue end if length(tline)>4 if strcmp(tline(2),':') i=str2num(tline(1)); L{i}=tline(4:end); elseif strcmp(tline(3),':') i=str2num(tline(1:2)); L{i}=tline(5:end); elseif strcmp(tline(4),':') i=str2num(tline(1:3)); L{i}=tline(6:end); end end if strcmp(tline(1),'a') | strcmp(tline(1),'t') | strcmp(tline(1),'c') | strcmp(tline(1),'g') L{i}=strcat(L{i},tline); end end fclose(fid); a_trans =[ 0.3662 0.1911 0.1911 0.2516 0.2456 0.3275 0.1696 0.2573 0.2359 0.1385 0.0974 0.5282 0.2601 0.0644 0.2029 0.4726]; b_trans=[ 0.3707 0.4081 0.1184 0.1028 0.2701 0.5931 0.0858 0.0511 0.2182 0.4727 0.1636 0.1455 0.3063 0.3964 0.0811 0.2162]; a_type=[];b_type=[];PA=[];PB=[]; ee_a=[];ee_b=[]; rubi=[]; rubirubi=[]; %A、B类内部检验 for i=1:182 tline=L{i}; rubi=find(tline=='n'|tline=='s'|tline=='r'|tline=='w'|tline=='y'); %剔除异常碱基序列 if ~isempty(rubi) kk=setdiff(1:length(tline),rubi); tline=tline(kk); end ka=find(tline(:)=='a'); kt=find(tline(:)=='t'); kc=find(tline(:)=='c'); kg=find(tline(:)=='g'); temp=[];temp(ka)=1;temp(kt)=2;temp(kc)=3;temp(kg)=4; pa=1;pb=1;zhishu_a=0;zhishu_b=0; %将指数和系数分离 for j=1:length(temp)-1 pa=pa*a_trans(temp(j),temp(j+1)); pb=pb*b_trans(temp(j),temp(j+1)); if pa<0.1& pa>0.01 pa=pa*10; zhishu_a=zhishu_a+1; elseif pa<0.01 &pa>1e-3 pa=pa*100; zhishu_a=zhishu_a+2; end if pb<0.1 & pb>.01 pb=pb*10; zhishu_b=zhishu_b+1; elseif pb<.01 pb=pb*100; zhishu_b=zhishu_b+2; end pa=round(pa*100)/100;%保留两位有效数字 pb=round(pb*100)/100; end ee_a=[ee_a zhishu_a]; %记录数据 ee_b=[ee_b zhishu_b]; PA=[PA pa]; PB=[PB pb]; end ee_ab=ee_a-ee_b; det_ab=PA-PB.*10.^(ee_ab); %b_type =find(ee_a-ee_b>10); %a_type =find(ee_b-ee_a>10); aa=find(det_ab>0) bb=find(det_ab<0)