Ado012
diff --git a/‎BasicCisElementAnalyzer.R
+18-16 b/‎BasicCisElementAnalyzer.R
+18-16
diff --git a/‎README.md
+45 b/‎README.md
+45
@@ -310,7 +310,7 @@ clusterScanner <- function(motifHits, arabSeqCS)
           print("clusterSeq")
           print(clusterSeq)
 
-          
+          #convert cluster to char format
           clusterSequlchar <-
             as.character(unlist(clusterSeq))
 
@@ -419,18 +419,18 @@ complexCoreScanner <-
 
 
       #complex core marking loop
-      for (i in 1:length(clusterCoreHits))
+      for (i in 1:length(clusterCoreHits)) #while still in list of core hits in the cluster
       {
         print("i")
         print(i)
 
         #if loop is not over
-        if (length(clusterCoreHits) >= (i + 1))
+        if (length(clusterCoreHits) >= (i + 1)) #if the size of the core hit list is at least one larger than the current position
         {
-          if (abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) <= 4)
+          if (abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) <= 4) #if next core hit is 4 positions or less away
           {#initialize or extend complex core if cores are close enough
             if (compCoreFlag==0)
-            { compCoreStart<-clusterCoreHits[i]
+            { compCoreStart<-clusterCoreHits[i] 
             print("comp core start")
             print(clusterCoreHits[i])
             }
@@ -440,7 +440,7 @@ complexCoreScanner <-
             print(clusterCoreHits[i])
           }
 
-          #if core spacing is high and a complex core is already started
+          #if the next core is more than 4 away and a complex core is already started
           else if ((abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) > 4) &&
                    compCoreFlag >= 1)
           {#end the core and upload info 
@@ -521,26 +521,26 @@ coreHits<-clusterInfoMS$coreHits
 
 
       #repeat slowly
-      #look around each core in cluster from left and right margins to see if surrounding cores are in a phasing pattern
+      #walk through core hits of cluster testing phasing pattern
       for (i in 1:length(coreHits))
 
       {
         j = 1
-        #while distance is less than 50, test phasing among front cores
+        #while next core is within core list and distance is less than 50, measure phasing from core i to all other cores in list
         while ((i+j <= length(coreHits)) && abs(coreHits[i] - coreHits[i + j]) < 50)
         {
           frontPhase <- abs(coreHits[i] - coreHits[i + j])
 
-          
-          if (frontPhase >= 9 &&
+          #within these distances gets 1 point in phasing score
+          if (frontPhase >= 9 &&       
               frontPhase <= 11 ||
               frontPhase >= 20 &&
               frontPhase <= 22 ||
               frontPhase >= 30 &&
               frontPhase <= 33 || frontPhase >= 41 && frontPhase <= 44)
           {phaseScore = phaseScore + 1}
 
-          
+          #within these distances gets 0.5 point in phasing score
           else if (frontPhase == 8 ||
                    frontPhase == 12 ||
                    frontPhase == 19 ||
@@ -574,13 +574,15 @@ coreHits<-clusterInfoMS$coreHits
       print("motifComplexCoreScores")
       print(motifComplexCoreScore)
 
+      #unlist complex core score to a value to prepare it for storage
       motifComplexCoreScoreSum<-sum(unlist(motifComplexCoreScore))
 
       print("motifComplexCoreScoreSum")
       print(motifComplexCoreScoreSum)
 
       clusterSeqULchar<-as.character(unlist(clusterInfoMS$clusterSeq))
 
+      #divide phasing score by number of bases in cluster and round to 5 decimal places to get phasing score per base
       phasePerBase<-motifPhaseScoreSum/nchar(clusterSeqULchar)
       phasePerBase<-round(phasePerBase,5)
 
@@ -629,7 +631,7 @@ motifWriter<-function(clusterInfoMW, chromMW, targetMW, arabSeqMW, fileStreamMW,
 
   #write info to file
   write.table(motifClusterdf,append=TRUE, col.names=FALSE, row.names=FALSE, quote=FALSE, sep=' ', fileStreamMW)
-  print("babe")
+  print("check")
 
   #write info to bed file 
   write.table(data.frame(chromMW,clusterInfoMW$clusterStart,clusterInfoMW$clusterEnd),append=TRUE, col.names= FALSE, row.names=FALSE, quote=FALSE, sep="\t",fileStreamBAMMW)
@@ -736,16 +738,16 @@ arabMotifSearch <- function(targetfile, resultsOutput="coreMotifOutput", results
 
       print("check13")
 
-      #scan for complex cores
+      #scan clusters one by one for complex cores
       clusterInfo<-complexCoreScanner(clusterStartCCS=clusterList$clusterStartList[i], clusterEndCCS=clusterList$clusterEndList[i], 
                                       clusterSeqCCS=clusterList$clusterSeq[i])
 
       print("check14")
 
-      #score 
+      #score cluster for complex core score and phasing score
       clusterAnnotated <-motifScorer(clusterInfoMS=clusterInfo)
 
-      #write
+      #arrange and write file output and bed output
       result<-motifWriter(clusterInfoMW=clusterAnnotated, chromMW=chrom, targetMW=targetEntry[j], arabSeqMW=arabSeq, 
                           fileStreamMW=fileStream, fileStreamBAMMW=fileStreamBAM, targetDataMW=targetMetachar[j])
 
@@ -760,7 +762,7 @@ arabMotifSearch <- function(targetfile, resultsOutput="coreMotifOutput", results
 #read in results
   resultList = read.table(resultsOutput, header=TRUE)
 
-  #sort results 
+  #sort results and write sorted file 
   attach(resultList)
   resultsList<-resultList[order(phasescore),]
   detach(resultList)
 
@@ -0,0 +1,45 @@
+
+PURPOSE: 
+
+To examine the relationship between cis elements and transcriptional output on a larger scale, the CRM structure of a list of upregulated and downregulated wus genes was analyzed computationally.
+
+Processing begins with the main function arabMotifSearch(). A target file containing the genes to be analyzed is input along with the desired names of the output files. A GFF file containing annotation information for Arabidopsis genes is also read in. The annotation file is broken into several pieces including chromosome, gene names, and start and end positions. A filestream is started for the output files. 
+
+Targets are extracted from the target list one by one. For each target the function targetScanner() is run, associating the target with a chromosome and sequence. The function MotifPrelimScanner() scans the sequence, defined as running from 3000 bp before the gene start to 3000 bp after the gene end, for TAAT/ATTA cis elements. The cis element list is fed into clusterScanner() to detect cis element clusters/CRMs, which are defined as strings of at least 4 cis elements, each within 50 bp of the previous one. 
+
+Each CRM is then scanned for complex cis elements in complexCoreScanner(). A complex cis element is defined as a string of cis elements in which each cis element is 4 bp or less from the previous one. motifScorer() then calculates the phasing score, which is defined by how well consecutive cis elements adhere to a 10.5x bp spacing relationship. It also calculates the complex core score, which is the summed length of the complex cis elements, and the phasing per base, which is the phasing score divided by the number of bp in the cluster.
+
+These results are arranged by gene and by cluster into columns and printed out by motifWriter().  
+
+
+USAGE: 
+
+Necessary files: 
+target file: e.g. upregulatedwusgenes_cyclo.csv
+Complete set of TAIR10 Chromosome files: e.g. Arabidopsis_thaliana.TAIR10.dna.chromosome.1.fa (from http://ftp.gramene.org/CURRENT_RELEASE/fasta/arabidopsis_thaliana/dna/). Place in Rdatafiles directory. Alternative chromosome fastas can be substituted by modifying the script.
+script: BasicCisElementAnalyzer.R. 
+
+Install R and ensure target file is in working directory. Run BasicCisElementAnalyzer.R. 
+
+sample command: arabMotifSearch(targetfile = "Rdatafiles/upregulatedwusgenes.csv", resultsOutput="Rdatafiles/MotifOutputUP.txt", resultsBed="Rdatafiles/MotifOutputBEDUP.bed", resultsSorted="Rdatafiles/MotifOutputSortedUP.txt")
+
+targetfile: list of genes to examine
+resultsOutput: desired name of main output file
+resultsBed: desired name of output file in BED format.
+resultsSorted: desired name of the sorted output file (currently nonfunctional)
+
+chainstart: start of cluster
+chainend: end of cluster	
+phasescore: phasing score (how well cores adhere to a 10.5x bp spacing)
+phasePerBase: phasing score divided by cluster bp
+corNum: number of cis elements
+complexCores: number of complex cis elements (multiple cis elements 4bp or less apart) 	
+complexCoreScore: length of complex cis elements summed across a cluster. 
+
+
+Loading BED files
+BED files can be loaded through a genome browser such as IGV from the Broad Institute. Simply download and run the browser. Load the TAIR10 Arabidopsis genome and then load the BED file as a custom track. 
+
+
+
+