# This script contains functions needed to run EM algorithm as published in Nair et al. 2014 as well as to handle # the results and create figures. # EM algorithm implementations and auxilary function em_shape_flip = function(c,q,data) { # EM algorithm to optimize classes profiles using data flipping. There are only # two flip states (sense, reversed) which do not need to be precised. K=dim(c)[1] N=dim(data)[1] l=array(dim=c(N,K,2)) p=array(dim=c(N,K,2)) for(i in 1:K) { c[i,]=c[i,]/mean(c[i,]) } rm=rowMeans(data) for(i in 1:N) { for(j in 1:K) {l[i,j,1] = sum(dpois(data[i,], c[j,] *rm[i],log=T)) } for(j in 1:K) {l[i,j,2] = sum(dpois(data[i,],rev(c[j,])*rm[i],log=T)) } } for(i in 1:N) { p[i,,] = q*exp(l[i,,] - max(l[i,,])) p[i,,] = p[i,,]/sum(p[i,,]) } q = apply(p, c(2,3), mean) c = (t(p[,,1]) %*% data) + (t(p[,,2]) %*% t(apply(data,1,rev))) c = c / apply(p,2,sum) return(list(c, q, p)) } em_shape_shift_flip = function(c,q,data) { # EM algorithm to optimize classes profiles using data flipping and shifting. The shifting paramter # is the second dimension of q. # # c a matrix (K rows x j columns) containing the K classes profiles to optimize # q the prior class probabilities # data a matrix of data where a row is a sample signal contained within j columns (bins) K = dim(c)[1] # number of classes L = dim(c)[2] # number of bins N = dim(data)[1] # number of data rows S = dim(q)[2] # shift in bins l = array(dim=c(N,K,S,2)) p = array(dim=c(N,K,S,2)) for(i in 1:K) { c[i,] = c[i,]/mean(c[i,]) } rm = matrix(nrow=N, ncol=S) for(k in 1:S) { rm[,k] = rowMeans(data[,k:(k+L-1)]) } for(i in 1:N) # classes { for(j in 1:K) # data rows { for(k in 1:S) # shift { l[i,j,k,1] = sum(dpois( data[i, k:(k+L-1)], c[j,]*rm[i, k], log=T)) # one orientation l[i,j,k,2] = sum(dpois(rev(data[i,])[k:(k+L-1)], c[j,]*rm[i, S-k+1], log=T)) # the other orientation } } } for(i in 1:N) { p[i,,,] = q*exp(l[i,,,]-max(l[i,,,])) p[i,,,] = p[i,,,]/sum(p[i,,,]) } q = apply(p, c(2,3,4), mean) c = 0 for(k in 1:S) { c = c + (t(p[,,k,1]) %*% data[,k :(k+L-1)]) + (t(p[,,k,2]) %*% t(apply(data[,(S-k+1):(S+L-k)], 1, rev))) } c = c / apply(p, 2, sum) m = sum((1:S)* apply(q,2,sum)) s = sum(((1:S) - m)**2*apply(q,2,sum))**0.5 for(i in 1:K) { q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 } return(list(c,q,p)) } # recenter shift probability distribution to force it to be a gaussian reg_shift_flip= function(q) { # This function regularizes the prior distribution of the shift probabilities to # make it a centered gaussian distribution. # This is a modified version of reg_shift() designed to handle 3D array coming # from em_shape_flip_shift() K=dim(q)[1] S=dim(q)[2] # There is no biological reason to estimate the mean and the sd from each flip state # independently. Thus the mean and sd will be computed from the whole array at once. m= sum((1:S)* apply(q,2,sum)) s=sum(((1:S)-m)**2*apply(q,2,sum))**0.5 for (i in 1:K) { q[i,,1] = q[i,,2] = sum(q[i,,]) * dnorm(1:S,floor(S/2)+1,s) / sum(dnorm(1:S,floor(S/2)+1,s)) / 2 } return(q) } # load/dump prob array to hard drive em.write.prob <- function(P, file) { # Dumps a class probability array/matrix

as return by the EM algorithm on the hard drive, under compressed RDS # format. This function is the reciproque function of em.read.prob(). # # Returns A string containing the address of the written file # # P an array or a matrix containing the EM posteriori class probability for each datum in the original data. # file a string indicating in which file P will be dumped. saveRDS(P, file=file) return(file) } em.read.prob <- function(file) { # Reads a compressed RDS formatted dumped class probability array/matrix as return by the EM algorithm and returns it. # This function is the reciproque function of em.write.prob(). # # Returns a matrix or an array (depending whether shift was enabled during the EM run) # # file a string indicating the address of the file to read. if(!file.exists(file)) { stop(sprintf("Error! %s does not exist!", file)) } array <- readRDS(file) return(array) } # extract data from SGA files em.extract.unoriented.sga.class.max <- function(SGA, P, file, bin.size, shift=F, flip=F, class, threshold=NA) { # Extracts and updates positions from an SGA file data stored in a dataframe, given the results of EM classification

if the positions # were assigned to class with a probability higher than for any other class (and if specified if this probabilities # are bigger than , where threshold should be > 0.5). The extracted positions are then updated if needed and # written to a file . # # Extracted SGA coordinates are updates in 3 ways : # 1) EM was run allowing flip. For a given datum, for the class of interet, if the major flip state is forward, strand is # set to '+', otherwise '-'. # 2) EM was run allowing shift. For a given datum, for the class of interest, the positions are shifted by a value # corresponding to the weighted average (knowing ) of all the shift states for this class. # 3) EM was run allowing shift and flip. For a given datum, for the class of interest, if the major flip state is forward, # the shift uodate is the shift weighted average and the strand is set to '+'. Otherwise, the shift update is the # weighted average sign is reversed (+20bp -> -20bp) and the strand is set to '-'. # # SGA a dataframe containing the data of the sga file of interest. # P a numerical array, the results of the EM classification. # file a string, the path where the extracted data will be written. # bin.size an int, the em algorithm matrix input bin size # shift a boolean indicating whether shift was allowed. # flip a boolean indicating whether flip was allowed. # class an int, the class to extract features from. # threshold a float, optonal, the minimal class probability for a feature # to be extracted. d = length(dim(P)) n.datum = dim(P)[1] n.class = dim(P)[2] # check whether PROB and filtered SGA correspond, they should have the same number of rows if(n.datum != nrow(SGA)) { stop(sprintf("Error! Prob array and sga dataframe do not have the same number of rows (%d vs %d)!", n.datum, nrow(SGA))) } # check whether file directory exists if(!dir.exists(dirname(file))) { stop(sprintf("Error! Directory %s does not exist!", dirname(file))) } # check bin size is > 0 if(bin.size <= 0) { stop(sprintf("Error! Bin size should be > 0!")) } # check class is an int > 0 if(class <= 0 || class > n.class) { stop(sprintf("Error! Class number is incorrect (%d classes detected)", n.class)) } # check threshold if given is > 0.5 assign a row to one class only if(!is.na(threshold) && threshold <= 0.5) { stop(sprintf("Error! Threshold should be strictly bigger than 0.5 to unambigously assign one major class per datum!")) } # no shift nor flip : no coord update needed if((!shift && !flip)) { # check dimension if(d != 2) { stop(sprintf("Error! Without shift nor flip, probability array is expected to have 2 dimensions (%d found)!", d)) } if((!shift && flip) && (d != 3)) { stop(sprintf("Error! With flip, probability array is expected to have 3 dimensions (%d found)!", d)) } # order class prob by decreasing order and retrieve class with highest prob CLASS.ORDER = t(apply(P, 1, order, decreasing=T)) CLASS.MAX = CLASS.ORDER[,1] SELECTED.ROWS = which(CLASS.MAX == class) # threshold if(!is.na(threshold)) { PASS.THRESHOLD = P[SELECTED.ROWS,class] > threshold SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD] PASS.THRESHOLD = NULL } # rewrite sga file rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND=NULL) CLASS.ORDER = CLASS.MAX = SELECTED.ROWS = NULL } # shift position update : for each datum, weighted average of all shifts for the most likely class else if(shift && !flip) { if(d != 3) { stop(sprintf("Error! With shift, probability array is expected to have 3 dimensions (%d found)!", d)) } # compute how each site should be moved to accound to the shifting probabilities # The shifting for each datum is computed as follows : # For a datum, the class probabilities is defined as the sum of the prob over all the shift states. # For a datum, the most likely class only is considered and a weighted average of the shift is computed accounting for # each shifting state probability. POS.UPDATE = compute.pos.update.shift(P, bin.size) # compute datum prob as the sum of the probabilities over all the shift states CLASS.PROB = apply(P,c(1,2),sum) CLASS.MAX = apply(CLASS.PROB,1,order, decreasing=T)[1,] SELECTED.ROWS = which(CLASS.MAX == class) # threshold if(!is.na(threshold)) { PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD] PASS.THRESHOLD = NULL } # select POS.UPDATE for kept datum POS.UPDATE = POS.UPDATE[SELECTED.ROWS] # rewrite sga file rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE, STRAND=NULL) CLASS.PROB = CLASS.MAX = SELECTED.ROWS = NULL } # flip position update : for each datum, check the major flip state and set strand accordingly else if(!shift && flip) { # check dimension if(d != 3) { stop(sprintf("Error! With flip, probability array is expected to have 3 dimensions (%d found)!", d)) } # class probability is the sum over the flip states CLASS.PROB = apply(P, c(1,2), sum) # order class prob by decreasing order and retrieve class with highest prob CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing=T))[,1] SELECTED.ROWS = which(CLASS.MAX == class) # for each datum, for the most likely class, find most likely flip FLIP.PROB = matrix(nrow=n.datum, ncol=2) for(i in 1:n.datum) { FLIP.PROB[i,1] = P[i,CLASS.MAX[i],1] FLIP.PROB[i,2] = P[i,CLASS.MAX[i],2] } FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1] STRAND = as.character(FLIP.MAX) STRAND[STRAND == '2'] = '-' STRAND[STRAND == '1'] = '+' # threshold if(!is.na(threshold)) { PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD] PASS.THRESHOLD = NULL } # select STRAND for kept datum STRAND = STRAND[SELECTED.ROWS] # rewrite sga file rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND) CLASS.ORDER = CLASS.MAX = SELECTED.ROWS = STRAND = NULL } # shift and flip position update : for each datum, weighted average of forward OR reverse shifts for the most likely class # and most likely flip. else if(shift && flip) { if(d != 4) { stop(sprintf("Error! With shift, probability array is expected to have 4 dimensions (%d found)!", d)) } # # compute how each site should be moved to accound to the shifting probabilities # The shifting for each datum is computed as follows : # For a datum, the class probabilities are defined as the sum of the prob over all the shift states. # For a datum, the flip probabilities are defined as the sum of the prob over the shift states # For a datum, the most likely class only is considered and then the most likely flip state is considered. # If the most likely flip is forward then the weighted average is made using the forward flip shift probabilities only. # If the most likely flip is reverse then the weighted average is made using the reverse flip shift probabilities only. POS.UPDATE = compute.pos.update.shift.flip(P, bin.size) # class probability is the sum over the shift and flip states CLASS.PROB = apply(P,c(1,2),sum) # order class prob by decreasing order and retrieve class with highest prob CLASS.MAX = apply(CLASS.PROB,1,order, decreasing=T)[1,] SELECTED.ROWS = which(CLASS.MAX == class) # for each datum, for the most likely class, most likely flip is the sum over the shift prob FLIP.PROB = matrix(nrow=n.datum, ncol=2) for(i in 1:n.datum) { FLIP.PROB[i,1] = sum(P[i,CLASS.MAX[i],,1]) FLIP.PROB[i,2] = sum(P[i,CLASS.MAX[i],,2]) } FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1] STRAND = as.character(FLIP.MAX) STRAND[STRAND == '1'] = '+' STRAND[STRAND == '2'] = '-' # threshold if(!is.na(threshold)) { PASS.THRESHOLD = CLASS.PROB[SELECTED.ROWS,class] > threshold SELECTED.ROWS = SELECTED.ROWS[PASS.THRESHOLD] PASS.THRESHOLD = NULL } # select POS.UPDATE and STRAND for kept datum POS.UPDATE = POS.UPDATE[SELECTED.ROWS] STRAND = STRAND[SELECTED.ROWS] # rewrite sga file rewrite.sga(SGA, file, SELECTED.ROWS, POS.UPDATE, STRAND) CLASS.PROB = CLASS.MAX = SELECTED.ROWS = NULL } SGA = FILERED.ROWS = NULL } rewrite.sga <- function(SGA, file, SELECTED.ROWS, POS.UPDATE=NULL, STRAND=NULL) { # Given a dataframe containing an SGA file data and a vector of line index of interest , # this function updates line positions according to and # the strand using , if needed. The results are written in a file . # # SGA a dataframe containing the data of the SGA file of interest # file a string, the file in which the results will be written. # SELECTED.ROWS a vector of int, the line index of to rewrite. # POS.UPDATE a vector of int, if given, the new position values for lines. # STRAND a vector of char, if given, the new value strand values in lines. SGA = SGA[SELECTED.ROWS,] # update sga positions if needed if(!is.null(POS.UPDATE)) { # checks whether POS.UPDATE and SELECTED.ROWS corresponds (should have the same length) if(length(POS.UPDATE) != length(SELECTED.ROWS)) { stop(sprintf("Error! Number of position updates differs from number of row to extract (%d vs %d)!", length(POS.UPDATE), length(SELECTED.ROWS))) } SGA[,3] = SGA[,3] + POS.UPDATE } # update sga strand if needed if(!is.null(STRAND)) { SGA[,4] = STRAND } # write SGA write.table(x=SGA, file=file, quote=F, sep='\t', row.names=F, col.names=F) SGA = NULL } compute.pos.update.shift <- function(P, bin.size) { # Given an array of probabilities fom EM classification using shift, this # function compute the weighted average shift for each datum, considering # only the major class of the datum. # # P a 3D array (n,k,s) of probabilities as returned by the EM algorithm. # n the number of sites, k the number of classes, s the number of shift # bin.size an int, the size of the vector count bin given to the EM, in bp. # # returns a vector of int, the average shift for each datum. n.datum = dim(P)[1] n.class = dim(P)[2] n.shift = dim(P)[3] # position difference in bp for each possible shift state SHIFT.DIFF = (-n.shift/2+(1:n.shift)-0.5)*bin.size # for each datum, the new class probs is the sum over the shift states probs # for each datum, select the class with the highest probability CLASS.PROB = apply(P,c(1,2),sum) CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing = T))[,1] # compute weighted average shift for each datum # for each datum, compute the weighted average shifting for the most likely class # i.g. a datum with 2 classes of prob 0.3 and 0.7 -> class 2 is the most likely # for class 2, shift states prob are 0.2, 0.8, 0 # bin.size is 10 thus the shift states are -10bp, 0bp, +10bp # weighted shift average is (0.2*-10) + (0.8*0) + (0*10) = -2bp SHIFT.WEIGHT = vector(length=n.datum, mode="numeric") for(i in 1:n.datum) { SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],] %*% SHIFT.DIFF) } SHIFT.DIFF = CLASS.PROB = CLASS.MAX = NULL return(SHIFT.WEIGHT) } compute.pos.update.shift.flip <- function(P, bin.size) { # Given an array of probabilities from EM classification using shift and flip, # this function compute the weighted average shift for each datum, considering # only the major class of the datum. # # P a 4D array (n,k,s,2) of probabilities as returned by the EM algorithm. # n the number of sites, k the number of classes, s the number of shift # states, 2 the number of flip states # bin.size an int, the size of the vector count bin given to the EM, in bp. # # returns a vector of int, the average shift for each datum. n.datum = dim(P)[1] n.class = dim(P)[2] n.shift = dim(P)[3] # position difference in bp for each possible shift state, for both flip states SHIFT.DIFF.fw = (-n.shift/2+(1:n.shift)-0.5)*bin.size SHIFT.DIFF.rev = (n.shift/2 -(1:n.shift)+0.5)*bin.size # for each datum, the new class probs is the sum over the shift and flip states probs # for each datum, select the class with the highest probability CLASS.PROB = apply(P,c(1,2),sum) CLASS.MAX = t(apply(CLASS.PROB, 1, order, decreasing = T))[,1] # for each datum, for the most likely class, most likely flip is the sum over the shift prob FLIP.PROB = matrix(nrow=n.datum, ncol=2) for(i in 1:n.datum) { FLIP.PROB[i,1] = sum(P[i,CLASS.MAX[i],,1]) FLIP.PROB[i,2] = sum(P[i,CLASS.MAX[i],,2]) } FLIP.MAX = t(apply(FLIP.PROB, 1, order, decreasing=T))[,1] # compute weighted average shift for each datum # for each datum, compute the weighted average shifting for the most likely class, for the most likely flip state # i.g. a datum with 2 classes of prob 0.3 and 0.7 -> class 2 is the most likely # a datum with 2 flip of prob 0.1 and 0.9 -> flip reverse is the most likely # for class 2, shift states prob are 0.2, 0, 0.1 for forward flip state # for class 2, shift states prob are 0.1, 0.1, 0.4 for reverse flip state # bin.size is 10, the shift states are -10bp, 0bp, +10bp # weighted shift average is done on the reverse flip state # reverse (0.1*+10) + (0.1*0) + (0.4*-10) = 1 + -4 = -3bp # weigthed average = -3bp SHIFT.WEIGHT = vector(length=n.datum, mode="numeric") for(i in 1:n.datum) { if(FLIP.MAX[i] == 1) { SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],,1] %*% SHIFT.DIFF.fw) } else { SHIFT.WEIGHT[i] = round(P[i,CLASS.MAX[i],,2] %*% SHIFT.DIFF.rev) } # SHIFT.FW = P[i,CLASS.MAX[i],,1] %*% SHIFT.DIFF.fw # SHIFT.REV = P[i,CLASS.MAX[i],,2] %*% SHIFT.DIFF.rev # SHIFT.WEIGHT[i] = round(SHIFT.FW + SHIFT.REV) } SHIFT.DIFF = CLASS.PROB = CLASS.MAX = FLIP.PROB = FLIP.MAX = SHIFT.FW = SHIFT.REV = NULL return(SHIFT.WEIGHT) } # results plotting # recompute class profile from array em.class.profiles.from.prob <- function(data, p, shift, flip) { # Recomputes the class aggregation patterns from the probability array/matrice and the raw data. # When the EM algorithm is run, it returns a matrice or an array with the given posteriori probabilities # for each datum, with a given shift (if enabled) to belong to a given class (with a given shift, if enabled) # as well as the classes aggregation pattern. This function allows to recompute the later. # # Returns a matrix # # data a matrice containing the data which were used to run the EM algorithm # p a matrix, a 3D array or a 4D array (if shift and flip were allowed) containing # the EM posteriori class probability for each datum of data. # shift a boolean indicating whether shift was allowed when running the EM algorithm. # flip a boolean indicating whether flip was allowed when running the EM algorith. # check probability array format if(!shift && !flip && (length(dim(p)) != 2)) { stop(sprintf("Error! Unknown probability array format! Array without shift nor flip has %d/2 dimensions", length(dim(p)))) } else if(shift && !flip && (length(dim(p)) != 3)) { stop(sprintf("Error! Unknown probability array format! Array with shift has %d/3 dimensions", length(dim(p)))) } else if(shift && flip && (length(dim(p)) != 4)) { stop(sprintf("Error! Unknown probability array format! Array with shift and flip has %d/4 dimensions", length(dim(p)))) } else if(!shift && flip && (length(dim(p)) != 3)) { stop(sprintf("Error! Unknown probability array format! Array with flip and no shift has %d/3 dimensions", length(dim(p)))) } else if(!shift && flip && (dim(p)[3] != 2)) { stop(sprintf("Error! Unknown probability array format! Array with flip and no shift should have 3rd dimension equal to two (two flip states, %d detected)", dim(p)[3])) } c = 0 # no shift no flip if(!shift && !flip) { c = (t(p) %*% data)/colSums(p) } # with shift only else if(shift && !flip) { S = dim(p)[3] L = dim(data)[2]-S for(s in 1:S) { # c = c + (t(p[,,s]) %*% data[,k:(s+L-1)])/colSums(p[,,s])} c = c + (t(p[,,s]) %*% data[,s:(s+L-1)]) } c = c / apply(p, 2, sum) } # with flip only else if(!shift && flip) { c = (t(p[,,1]) %*% data) +(t(p[,,2]) %*% t(apply(data,1,rev))) c = c / apply(p,2,sum) } # with shift and flip else if(shift && flip) { S = dim(p)[3] L = dim(data)[2]-S c = 0 for(s in 1:S) { c = c + (t(p[,,s,1]) %*% data[,s :(s+L-1)]) + (t(p[,,s,2]) %*% t(apply(data[,(S-s+1):(S+L-s)], 1, rev))) } c = c / apply(p, 2, sum) } return(c) } em.plot.class.profile <- function(data.list, prob, plot.title, file, iter.n, shift.n=0, flip=FALSE, col.names, lty, lwd, col, row.masker=NULL) { # Simple modification of em.plot.class.profile.multiple(). This function allows to set the line color instead of drawing each class with a different # color as em.plot.class.profile.multiple() does. # # Wrapper function to overlay EM class profiles of several datasets. Given the n sets of raw data and # a class probability matrice/array as returned by EM algorithm, the class profiles will be recomputed # (taking an eventual shift into account) and all the data classes ploted separately on a single figure. # # Returns Nothing # # data.list a list containing N data matrices (K rows x j columns) to be overlaid on the plot. The # first matrix of the list MUST be the one on which the EM algorithm has been run. A null row # filter will be set accordingly to this first matrix (rows corresponding to null rows in the 1st # matrix will be removed from the following matrices). # prob a matrix, a 3D array (if shift was allowed) or a 4D array (if shift and flip were allowed) containing # the EM posteriori class probability for each datum in the original data. # plot.title a string containing the plot title. # file a string, the file where the plot will be saved. # iter.n an int, the number of iteration of EM algorithm used to produce p and will be used as shifting. # information to recompute the class profiles. Will also be used as part of the file name. # shift.n an integer indicating the shift allowed when the EM algorithm was run. If shifting was not allowed, # this argument should be set to 0. # flip a boolean indicating whether flipping was allowed when the EM algorithm was run. # col.names a vector of strings containing the name the columns of the data on which the EM algorithm was run. # This will be used to label thick under the x-axis (taking a shift into account). # lty a vector of N int, indicating which line type (lty) should be used to plot each of the N data types. # lwd a vector of N int, indicating which line width (lwd) should be used to plot each of the N data types. # col a vector of N colors (int, str, any usable color code) indicating the color of each line. # row.masker a vector of at max K int specifying which data raw should not be drawn on the plots. If NULL, no # row masking will be performed and all the data will be ploted. # determine shift and flip if(shift.n > 0) { shift <- TRUE } else { shift <- FALSE } if(flip) { flip <- TRUE } else { flip <- FALSE } # check probability array format if(!shift && !flip && (length(dim(prob)) != 2)) { stop(sprintf("Error! Unknown probability array format! Array without shift nor flip has %d/2 dimensions", length(dim(prob)))) } else if(shift && !flip && (length(dim(prob)) != 3)) { stop(sprintf("Error! Unknown probability array format! Array with shift has %d/3 dimensions", length(dim(prob)))) } else if(shift && flip && (length(dim(prob)) != 4)) { stop(sprintf("Error! Unknown probability array format! Array with shift and flip has %d/4 dimensions", length(dim(prob)))) } else if(!shift && flip && (length(dim(prob)) != 3)) { stop(sprintf("Error! Unknown probability array format! Array with flip and no shift has %d/3 dimensions", length(dim(prob)))) } else if(!shift && flip && (dim(prob)[3] != 2)) { stop(sprintf("Error! Unknown probability array format! Array with flip and no shift should have 3rd dimension equal to two (two flip states, %d detected)", dim(prob)[3])) } # number of datasets present n.data <- length(data.list) # number of classes K <- dim(prob)[2] # number of elements (data row) N <- dim(prob)[1] if(length(lty) != n.data) { stop("Error! lty argument is too short or too long! Should be the same length as data.list.") } if(length(lwd) != n.data) { stop("Error! lwd argument is too short or too long! Should be the same length as data.list.") } if(length(row.masker) > dim(prob)[1]) { stop("Error! row.maker is too long! It cannot contain more elements than the data matrix row number.") } # determine xlab marks and thicks given shift # compute from-to limits on xaxis with shift n.bins <- length(col.names) n.less.bin.left <- floor(shift.n/2) n.less.bin.right <- ceiling(shift.n/2) from.to <- col.names[n.less.bin.left:(n.bins-n.less.bin.right)] # for plotting purposes, determine where the ticks and what ticks to put on x-axis x.at <- seq(1,199-shift.n,11) x.axt <- from.to[x.at] # compute class profiles profil.list <- list() for(i in 1:n.data) { tmp.prob <- prob tmp.data <- data.list[[i]] if(!is.null(row.masker)) { # prob is an array if(shift.n) { tmp.prob <- tmp.prob[-row.masker,,] } # prob is a matrix else { tmp.prob <- tmp.prob[-row.masker,] } tmp.data <- tmp.data[-row.masker,] } tmp.c <- em.class.profiles.from.prob(tmp.data, tmp.prob, shift=shift, flip=flip) profil.list[[i]] <- tmp.c tmp.prob <- NULL tmp.data <- NULL } # compute class probability, 'apply(prob,2,sum)' works the same if prob is a matrix or an array if(is.null(row.masker)) { prob.vector <- apply(prob, 2, sum)/sum(prob) } else { # prob is an array if(shift.n) { tmp.prob <- prob[-row.masker,,] } # prob is a matrix else { tmp.prob <- prob[-row.masker,] } prob.vector <- apply(tmp.prob, 2, sum)/sum(tmp.prob) } # plot # address where the plot will be registered png(file, width=600, height=900) opar = par(mfrow=c(K+1,1)) y.lim <- c(0, 1.2) x.len <- dim(profil.list[[1]])[2] x <- c(1:x.len) # data aggregation plot # aggregation profile is the sum of the class profiles weighted by their probability plot.main <- sprintf("%s\nAggregation (N=%d)", plot.title, N) d <- prob.vector %*% profil.list[[1]] d <- d / max(d) plot(x, d, ylim=y.lim, main=plot.main, xlab="Approximated dist. to pattern [bp]", ylab="Prop. weighted signal", xaxt='n', yaxt='n', bty='n', col=col[1], type = 'l', lty=lty[1], lwd=lwd[1]) axis(1, at=x.at, labels=x.axt, las=2) axis(2, at=seq(0,1,0.25), labels=seq(0,1,0.25)) if(n.data > 1) { for(i in 2:n.data) { y <- prob.vector %*% profil.list[[i]] y <- y / max(y) lines(x, y, col=col[i], lty=lty[i], lwd=lwd[i]) } } # loop over classes -> 1 plot/class for(i in 1:K) { j <- 1 d <- profil.list[[j]][i,] / max(profil.list[[j]][i,]) plot.main <- sprintf("class %d/%d", i, K) plot(d, ylim=y.lim, main=plot.main, xlab="Approximated dist. to pattern [bp]", ylab="Prop. weighted signal", xaxt='n', yaxt='n', bty='n', col=col[j], type = 'l', lty=lty[j], lwd=lwd[j]) # writes class overall probability text(x.len-2, 1.1, labels=round(prob.vector[i],3), cex=1.5, col="black") # draw axis axis(1, at=x.at, labels=x.axt, las=2) axis(2, at=seq(0,1,0.25), labels=seq(0,1,0.25)) # loop over matrix -> add data to current plot if(n.data > 1) { for(j in 2:n.data) { x <- c(1:length(profil.list[[j]][i,])) y <- profil.list[[j]][i,] / max(profil.list[[j]][i,]) lines(x, y, col=col[j], lty=lty[j], lwd=lwd[j]) } } } par(opar) dev.off() } # wrappers to run EM clustering em.shape.shift.flip <- function(MATRIX, k=1, iter=10, shift=10, seeding="iterative") { # Wraper to run em.shape() with flip and with or without shift on a data matrix . The algorithm will # be run times, assigning each datum to all classes with a certain probability. After the run the # probability array(s) returned by em.shape() or any of its derivative function is(are) returned. # # Iterative seeding : the 1st class profile (flat class) is initialized to the mean of the data (mean per column). The # data are then iteratively classified using 1 more class (resetting an untrained flat class at each iteration) until # classes have been reached. Before adding an extra class, the algorithm is run times. # Random seeding : prior probabilities for each datum to belong to all classes are randomly assigned, using a Beta # distribution, at the beginning. # # Returns a list of numerical arrays. The list length is one if seeding is "random", if seeding is "iterative" # where each element is a classification probability array with 1,2,..., classes. # The array dimensions are (n,k,f) without shift or (n,k,s,f) with shift and flip where n is the number of # datum, k the number of classes, s the number of allowed shift states and f=2 (2 shift states : non-flip, # reversed). # # MATRIX a numerical matrix on which the classification is to be performed. # k the number of classes to be optimized by the EM algorithm IN ADDITION to the flat class # iter an int, the number of iteraction to optimized the current number of classes # shift an int, the maximum shift allowed for each data row (in bins) # seeding a string indicating the initializing strategy to use ("iterative" or "random"). Using # "iterative" will lead to the use of an untrained flat class to which one class will be # added a the time until reaching final classes. When using "random", the class probabilities # are initialized randomly following a beta distribution. p_list= list() warning.flag = FALSE if(seeding != "iterative" && seeding != "random") { stop("Error! Incorrect seeding method!") } ## EM probabilistic clustering ## # without shifting if(shift == 0) { N = dim(MATRIX)[1] # number of bins in each sample L = dim(MATRIX)[2] # number of bins in the pattern # using iterative partitioning with untrained flat class if(seeding=="iterative") { K = k -1 # number of classes, with a flat class -> final number of class is k # code adapted from http://ccg.vital-it.ch/var/sib_april15/cases/nair12/ctcf_ChipPart.html#hide4 c = colMeans(MATRIX) # contains the class profiles to optimize, contains less bins than sample because of the shifting flat = matrix(data=mean(MATRIX), nrow=1, ncol=L) # contains the flat class profile which value is the mean of the data q = q0 = dnorm(1:2, floor(1/2)+1, 1) / sum(dnorm(1:2, floor(1/2)+1, 1)) # contains the prior probability for each class, detrmines the number of classes when em is run warning.flag = FALSE for (m in 1:K) { c = rbind(flat,c) q = rbind(q0/m,q) q = q / sum(q) for(i in 1:iter) { c[1,] = flat em.return = em_shape_flip(c,q,MATRIX) c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]]; em.return = NULL # check whether a class has an overall probability of 0, if true algorithm stops # this should be checked because it means that some prob values are now too small to be # properly represented. Once it is 0, it will remain 0 and can lead to troubles if too # many of them appears (like division by 0). if(length(which(q == 0)) != 0) { warning.flag = TRUE warning.msg = capture.output(q) warning.msg[1] = paste0(sprintf("nNumber of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1]) warning.msg = paste(warning.msg, collapse = "\n") warning(warning.msg) break } } if(warning.flag) { break } p_list[[m]] = p } } # using random seeding else if(seeding=="random") { K = k # number of classes p = matrix(nrow=N, ncol=K) q = matrix(nrow=K, ncol=2) for(m in 1:K) { p[,m] = rbeta(N, N**-0.5, 1) } c = (t(p) %*% MATRIX) / colSums(p) q[,1] = q[,2] = rep(1/K, K) q = q / sum(q) for(i in 1:iter) { em.return = em_shape_flip(c, q, MATRIX) c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]] em.return = NULL # check whether a class has an overall probability of 0, if true algorithm stops # this should be checked because it means that some prob values are now too small to be # properly represented. Once it is 0, it will remain 0 and can lead to troubles if too # many of them appears (like division by 0). if(length(which(q == 0)) != 0) { warning.flag = TRUE warning.msg = capture.output(q) warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1]) warning.msg = paste(warning.msg, collapse = "\n") warning(warning.msg) break } } p_list[[1]] = p } p = NULL } # with shifting else { S = shift # maximum shift in bins N = dim(MATRIX)[1] # number of bins in each sample L = dim(MATRIX)[2] - S # number of bins in the pattern taking the shift into account # using iterative partitioning with untrained flat class if(seeding=="iterative") { # code adapted from http://ccg.vital-it.ch/var/sib_april15/cases/nair12/ctcf_ChipPart.html#hide4 K = k -1 # number of classes, with a flat class -> final number of class is k c = colMeans(MATRIX[,(floor(S/2)+1):(floor(S/2)+L)]) # contains the class profiles to optimize, contains less bins than sample because of the shifting flat = matrix(data=mean(MATRIX), nrow=1, ncol=L) # contains the flat class profile which value is the mean of the data q = array(dim=c(1,S,2)) # contains the prior probability for each class, detrmines the number of classes when em is run q[,,1] = dnorm(1:S,floor(S/2)+1,1)/sum(dnorm(1:S,floor(S/2)+1,1)) / 2 # each class starts with equal prob for both flip states q[,,2] = dnorm(1:S,floor(S/2)+1,1)/sum(dnorm(1:S,floor(S/2)+1,1)) / 2 # DIVIDED BY TWO BECAUSE EACH CLASS HAS TWO FLIP STATES q0 = q warning.flag = FALSE for (m in 1:K) { c = rbind(flat,c) q.tmp = array(dim=c(m+1,S,2)) q.tmp[2:(m+1),,] = q q.tmp[1,,] = q0/m q.tmp = q.tmp/sum(q.tmp) q = q.tmp q.tmp = NULL for(i in 1:iter) { q = reg_shift_flip(q) c[1,] = flat em.return = em_shape_shift_flip(c,q,MATRIX) c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]]; em.return = NULL # check whether a class has an overall probability of 0, if true algorithm stops # this should be checked because it means that some prob values are now too small to be # properly represented. Once it is 0, it will remain 0 and can lead to troubles if too # many of them appears (like division by 0). if(length(which(q == 0)) != 0) { warning.flag = TRUE warning.msg = capture.output(q) warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1]) warning.msg = paste(warning.msg, collapse = "\n") warning(warning.msg) break } } if(warning.flag) { break } p_list[[m]] = p } } # using random seeding else if(seeding=="random") { K = k # number of classes p = matrix(nrow=N, ncol=K) q = array(dim=c(K, S, 2)) for(i in 1:K) { p[,i] = rbeta(N, N**-0.5, 1) } c = (t(p) %*% MATRIX[,(floor(S/2)+1):(floor(S/2)+L+1)]) / colSums(p) for(m in 1:K) { q[m,,] = matrix(nrow=S, ncol=2, data=rep(1/K, S*2)) } q = q / sum(q) for(i in 1:iter) { q = reg_shift_flip(q) em.return = em_shape_shift_flip(c, q, MATRIX) c = em.return[[1]]; q = em.return[[2]]; p = em.return[[3]] em.return = NULL # check whether a class has an overall probability of 0, if true algorithm stops # this should be checked because it means that some prob values are now too small to be # properly represented. Once it is 0, it will remain 0 and can lead to troubles if too # many of them appears (like division by 0). if(length(which(q == 0)) != 0) { warning.flag = TRUE warning.msg = capture.output(q) warning.msg[1] = paste0(sprintf("Number of data row used : %d\nAlgorithm stopped at %d classes, %d iteration! A class posterior probability is zero!\n", nrow(MATRIX), m, i), warning.msg[1]) warning.msg = paste(warning.msg, collapse = "\n") warning(warning.msg) break } } p_list[[1]] = p } p = NULL } return(p_list) }