function dna_data= SimulateRecombRateVar4Taxa(varargin) % SimulateRecombRateVar4Taxa -- Synthetic recombination % and rate variation. Simulates an alignment of 4 DNA sequences % with two recombinant regions and one differently diverged region. % On terminating the execution of this program, the following % manual modifications are required to get the file into % Bambe format: % Replace 1.0000000e+00 --> A (3 blanks leading) % Replace 2.0000000e+00 --> C (3 blanks leading) % Replace 3.0000000e+00 --> G (3 blanks leading) % Replace 4.0000000e+00 --> T (3 blanks leading) % Add names to the species: Strain_1, Strain_2 etc. % Header line: must contain 2 integers for the number of strains % and the length of the alignment. % If the default values are used, this will be: 4 1000 % Change from version 1: % - new parameter: random seed % - Specify minimum and maximum branch lengths wMin and wMax. % - Different defaults % % % INPUT % N number of nucleotides % n1 begin of first recombination event: % A0 <--> B0 % n2 begin of second recombination event: % A00 <--> B00 % n3 begin of the differently diverged region % % nlengthA length of the 1st recombination events % nlengthB length of the 2nd recombination events % nlengthC length of the differently diverged region % tt_ratio transition-transversion ratio % wMin minimum branch length % wMax maximum branch length % factor_w factor on w for the differently diverged region % prf1 fraction of mutations before 1st % recombination (along outer branches) % (prf= pre-recombination fraction) % prf2 Idem for 2nd recombination % % OUTPUT % dna_data Synthetic multiple DNA sequence alignment % % FUNCTION CALLS % --> Mutation, Kimura2PM, SampleFromMultinomial % (all in SERAD) % % EXAMPLES % dna_data= SimulateRecombRateVar4Taxa (this uses default settings) % OR % dna_data= SimulateRecombRateVar4Taxa('N',N,'n1',n1,'n2',n2,'n3',n3,'nlengthA',nlengthA,'nlengthB',nlengthB,'nlengthC',nlengthC,'wMin',wMin,'wMax',wMax,'fator_w',factor_w,'randSeed',randSeed,'prf1',prf1,'prf2',prf2) % % A reminder of the topologies: % % % % A0 % l--------- % l % l % A l % l--------l % l l % l l % l l % l l A1 % l l--------- % l % l % R l % -------l % l % l % l % l B0 % l l--------- % l l % l l % l B l % l--------l % l % l % l % l B1 % l--------- % % % % % 1st recombination event: A0 <--> B0 % 2nd recombination event: A0 <--> B1 % % --------------------------------------- % Fixed specifications % --------------------------------------- tt_ratio= 2; % tt ratio % --------------------------------------- % Default specifications % --------------------------------------- N=1000; % number of nucleotides n1=200; % begin of first recombination event n2=500; % begin of second recombination event n3=800; % begin of the differently diverged region nlengthA=100; % length of the 1st recombination event nlengthB=100; % length of the 2nd recombination event nlengthC=100; % length of the differenly diverged region wMin=0.075; % minimum branch length wMax=0.125; % maximum branch length factor_w=3; % factor for hotspot region randSeed=0; % Rand seed; default 0 means no explicit seeding. % fraction of mutations before recombination, first event prf1=0.5; % fraction of mutations before recombination, second event prf2=0.75; % --------------------------------------- % Reading in parameters % --------------------------------------- args = varargin; nargs = length(args); for i=1:2:nargs switch args{i}, case 'N', N = args{i+1}; case 'n1', n1= args{i+1}; case 'n2', n2= args{i+1}; case 'n3', n3= args{i+1}; case 'nlengthA', nlengthA= args{i+1}; case 'nlengthB', nlengthB= args{i+1}; case 'nlengthC', nlengthC= args{i+1}; case 'wMin', wMin= args{i+1}; case 'wMax', wMax= args{i+1}; case 'factor_w',factor_w= args{i+1}; case 'randSeed',randSeed= args{i+1}; case 'prf1',prf1= args{i+1}; case 'prf2',prf2= args{i+1}; otherwise, error('Wrong argument, possibly a mistyped keyword'); end end if prf1<=0 | prf2>1 | prf1>prf2 error('This is wrong: prf1<=0 OR prf2>=1 OR prf1>prf2') end qrf1=prf2-prf1; qrf2=1-prf2; if wMin<0.000001 error('wMin must be positve'); end if wMax<=wMin error('wMax must be greater than wMax'); end if randSeed ~= 0 rand('seed',randSeed); end I_am_at_the_following_level=0 w= wMin+rand*(wMax-wMin); % --------------------------------------- % Root node % --------------------------------------- for i=1:N node_root(i)=SampleFromMultinomial([0.25 0.25 0.25 0.25]); end I_am_at_the_following_level=1 w= wMin+rand*(wMax-wMin); % --------------------------------------- % First-layer nodes % --------------------------------------- for i=1:N node_A(i)=Mutation(node_root(i),w,tt_ratio); node_B(i)=Mutation(node_root(i),w,tt_ratio); end % --------------------------------------------------------- % First-layer nodes: differently diverged region % --------------------------------------------------------- for i=n3:n3+nlengthC-1 node_A(i)=Mutation(node_root(i),factor_w*w,tt_ratio); node_B(i)=Mutation(node_root(i),factor_w*w,tt_ratio); end I_am_at_the_following_level=2 w1= wMin+rand*(wMax-wMin); w2= wMin+rand*(wMax-wMin); w3= wMin+rand*(wMax-wMin); w4= wMin+rand*(wMax-wMin); % --------------------------------------------------------- % Second-layer nodes before 1st recombination % --------------------------------------------------------- for i=1:N node_A0(i)=Mutation(node_A(i),prf1*w1,tt_ratio); node_A1(i)=Mutation(node_A(i),prf1*w2,tt_ratio); node_B0(i)=Mutation(node_B(i),prf1*w3,tt_ratio); node_B1(i)=Mutation(node_B(i),prf1*w4,tt_ratio); end % ------------------------------------------------------------- % 1st recombination event: A0 <-- B0 % -------------------------------------------------------------- % x=node_A0(n1:n1+nlengthA-1); node_A0(n1:n1+nlengthA-1)=node_B0(n1:n1+nlengthA-1); % node_B0(n1:n1+nlengthA-1)=x; % --------------------------------------------------------- % Second-layer nodes between recombination events % --------------------------------------------------------- for i=1:N node_A0(i)=Mutation(node_A0(i),qrf1*w1,tt_ratio); node_A1(i)=Mutation(node_A1(i),qrf1*w2,tt_ratio); node_B0(i)=Mutation(node_B0(i),qrf1*w3,tt_ratio); node_B1(i)=Mutation(node_B1(i),qrf1*w4,tt_ratio); end % ------------------------------------------------------------- % 2nd recombination event: A0 <-- B1 % -------------------------------------------------------------- % x=node_A0(n2:n2+nlengthB-1); node_A0(n2:n2+nlengthB-1)=node_B1(n2:n2+nlengthB-1); % node_B0(n2:n2+nlengthB-1)=x; % --------------------------------------------------------- % Second-layer nodes after recombination % --------------------------------------------------------- for i=1:N node_A0(i)=Mutation(node_A0(i),qrf2*w1,tt_ratio); node_A1(i)=Mutation(node_A1(i),qrf2*w2,tt_ratio); node_B0(i)=Mutation(node_B0(i),qrf2*w3,tt_ratio); node_B1(i)=Mutation(node_B1(i),qrf2*w4,tt_ratio); end % --------------------------------------------------------- % Second-layer nodes: differently diverged region % --------------------------------------------------------- for i=n3:n3+nlengthC-1 node_A0(i)=Mutation(node_A(i),factor_w*w1,tt_ratio); node_A1(i)=Mutation(node_A(i),factor_w*w2,tt_ratio); node_B0(i)=Mutation(node_B(i),factor_w*w3,tt_ratio); node_B1(i)=Mutation(node_B(i),factor_w*w4,tt_ratio); end % --------------------------------------------------------- % Save to file % --------------------------------------------------------- dna_data(1,:)=node_A0; dna_data(2,:)=node_A1; dna_data(3,:)=node_B0; dna_data(4,:)=node_B1; if 0 save dna.dat dna_data -ascii; % Strange enough, the following doesn't work: ! sed 's/.0000000e+00//g' dna.dat > stercus.dat ! sed 's/ //g' stercus.dat > stercus1.dat ! sed 's/1/A/g' stercus1.dat > stercus2.dat ! sed 's/2/C/g' stercus2.dat > stercus3.dat ! sed 's/3/G/g' stercus3.dat > stercus4.dat ! sed 's/4/T/g' stercus4.dat > stercus5.dat ! /bin/rm stercus.dat ! /bin/rm stercus1.dat ! /bin/rm stercus2.dat ! /bin/rm stercus3.dat ! /bin/rm stercus4.dat end