Skip to content

Commit a5084e2

Browse files
committed
added readme and results files
1 parent 6411efe commit a5084e2

9 files changed

+3723
-16
lines changed

BasicCisElementAnalyzer.R

+18-16
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ clusterScanner <- function(motifHits, arabSeqCS)
310310
print("clusterSeq")
311311
print(clusterSeq)
312312

313-
313+
#convert cluster to char format
314314
clusterSequlchar <-
315315
as.character(unlist(clusterSeq))
316316

@@ -419,18 +419,18 @@ complexCoreScanner <-
419419

420420

421421
#complex core marking loop
422-
for (i in 1:length(clusterCoreHits))
422+
for (i in 1:length(clusterCoreHits)) #while still in list of core hits in the cluster
423423
{
424424
print("i")
425425
print(i)
426426

427427
#if loop is not over
428-
if (length(clusterCoreHits) >= (i + 1))
428+
if (length(clusterCoreHits) >= (i + 1)) #if the size of the core hit list is at least one larger than the current position
429429
{
430-
if (abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) <= 4)
430+
if (abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) <= 4) #if next core hit is 4 positions or less away
431431
{#initialize or extend complex core if cores are close enough
432432
if (compCoreFlag==0)
433-
{ compCoreStart<-clusterCoreHits[i]
433+
{ compCoreStart<-clusterCoreHits[i]
434434
print("comp core start")
435435
print(clusterCoreHits[i])
436436
}
@@ -440,7 +440,7 @@ complexCoreScanner <-
440440
print(clusterCoreHits[i])
441441
}
442442

443-
#if core spacing is high and a complex core is already started
443+
#if the next core is more than 4 away and a complex core is already started
444444
else if ((abs(clusterCoreHits[i] - clusterCoreHits[i + 1]) > 4) &&
445445
compCoreFlag >= 1)
446446
{#end the core and upload info
@@ -521,26 +521,26 @@ coreHits<-clusterInfoMS$coreHits
521521

522522

523523
#repeat slowly
524-
#look around each core in cluster from left and right margins to see if surrounding cores are in a phasing pattern
524+
#walk through core hits of cluster testing phasing pattern
525525
for (i in 1:length(coreHits))
526526

527527
{
528528
j = 1
529-
#while distance is less than 50, test phasing among front cores
529+
#while next core is within core list and distance is less than 50, measure phasing from core i to all other cores in list
530530
while ((i+j <= length(coreHits)) && abs(coreHits[i] - coreHits[i + j]) < 50)
531531
{
532532
frontPhase <- abs(coreHits[i] - coreHits[i + j])
533533

534-
535-
if (frontPhase >= 9 &&
534+
#within these distances gets 1 point in phasing score
535+
if (frontPhase >= 9 &&
536536
frontPhase <= 11 ||
537537
frontPhase >= 20 &&
538538
frontPhase <= 22 ||
539539
frontPhase >= 30 &&
540540
frontPhase <= 33 || frontPhase >= 41 && frontPhase <= 44)
541541
{phaseScore = phaseScore + 1}
542542

543-
543+
#within these distances gets 0.5 point in phasing score
544544
else if (frontPhase == 8 ||
545545
frontPhase == 12 ||
546546
frontPhase == 19 ||
@@ -574,13 +574,15 @@ coreHits<-clusterInfoMS$coreHits
574574
print("motifComplexCoreScores")
575575
print(motifComplexCoreScore)
576576

577+
#unlist complex core score to a value to prepare it for storage
577578
motifComplexCoreScoreSum<-sum(unlist(motifComplexCoreScore))
578579

579580
print("motifComplexCoreScoreSum")
580581
print(motifComplexCoreScoreSum)
581582

582583
clusterSeqULchar<-as.character(unlist(clusterInfoMS$clusterSeq))
583584

585+
#dividing phasing score by bases in cluster and round to 5 decimel places for phasing score per base
584586
phasePerBase<-motifPhaseScoreSum/nchar(clusterSeqULchar)
585587
phasePerBase<-round(phasePerBase,5)
586588

@@ -629,7 +631,7 @@ motifWriter<-function(clusterInfoMW, chromMW, targetMW, arabSeqMW, fileStreamMW,
629631

630632
#write info to file
631633
write.table(motifClusterdf,append=TRUE, col.names=FALSE, row.names=FALSE, quote=FALSE, sep=' ', fileStreamMW)
632-
print("babe")
634+
print("check")
633635

634636
#write info to bed file
635637
write.table(data.frame(chromMW,clusterInfoMW$clusterStart,clusterInfoMW$clusterEnd),append=TRUE, col.names= FALSE, row.names=FALSE, quote=FALSE, sep="\t",fileStreamBAMMW)
@@ -736,16 +738,16 @@ arabMotifSearch <- function(targetfile, resultsOutput="coreMotifOutput", results
736738

737739
print("check13")
738740

739-
#scan for complex cores
741+
#scan clusters one by one for complex cores
740742
clusterInfo<-complexCoreScanner(clusterStartCCS=clusterList$clusterStartList[i], clusterEndCCS=clusterList$clusterEndList[i],
741743
clusterSeqCCS=clusterList$clusterSeq[i])
742744

743745
print("check14")
744746

745-
#score
747+
#score cluster for complex core score and phasing score
746748
clusterAnnotated <-motifScorer(clusterInfoMS=clusterInfo)
747749

748-
#write
750+
#arrange and write file output and bed output
749751
result<-motifWriter(clusterInfoMW=clusterAnnotated, chromMW=chrom, targetMW=targetEntry[j], arabSeqMW=arabSeq,
750752
fileStreamMW=fileStream, fileStreamBAMMW=fileStreamBAM, targetDataMW=targetMetachar[j])
751753

@@ -760,7 +762,7 @@ arabMotifSearch <- function(targetfile, resultsOutput="coreMotifOutput", results
760762
#read in results
761763
resultList = read.table(resultsOutput, header=TRUE)
762764

763-
#sort results
765+
#sort results and write sorted file
764766
attach(resultList)
765767
resultsList<-resultList[order(phasescore),]
766768
detach(resultList)

README.md

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
2+
PURPOSE:
3+
4+
To examine the relationship between cis elements and transcriptional output on a larger scale the CRM structure of a list of upregulated and downregulated wus genes was analyzed computationally.
5+
6+
Processing begins with the main function Arabmotifsearch(). A target file containing the genes to be analyzed is input along with the desired names of the output files. A GFF file containing annotation information for Arabidopsis genes is also read in. The annoation file is broken into several pieces including chromosome, gene names, and start and end positions. A filestream is started for the output files.
7+
8+
Targets are extracted from the target list one by one. For each target the function targetScanner() is run associating the target with a chromosome and sequence. The function MotifPrelimScanner() scans the sequence defined from 3000 before the gene start to 3000 after the gene end for TAAT/ATTA cis elements. The cis elements list is fed into ClusterScanner() to detect cis element clusters/CRMs which are defined as strings of at least 4 cis elements which are within 50 bp of the last.
9+
10+
Each CRM is then scanned for complex cis elements in Complexcorescanner() which is defined as a string of cis elements which each cis elements is 4bp or less from the last one. Motifscorer() elements calculates phasing score which is defined by how well consecutive cis elements adhere to a 10.5x bp spacing relationship. Then complexcore score; which is the length of complex cis elements summed, the phasing per base; where the phasing is divided by the number of bp of the cluster is calculated.
11+
12+
These results are arranged by gene and by cluster into columns and printed out by Motifwriter().
13+
14+
15+
USEAGE:
16+
17+
Necessary files:
18+
target file : ex upregulatedwusgenes_cyclo.csv
19+
Complete set of TAIR10 Chromosome files: ex Arabidopsis_thaliana.TAIR10.dna.chromosome.1.fa (from http://ftp.gramene.org/CURRENT_RELEASE/fasta/arabidopsis_thaliana/dna/). Place in Rdatafiles directory. Alternative chromosome fastas can be substituted by modifying the script.
20+
script: BasicCisElementAnalyzer.R.
21+
22+
Install R and ensure target file is in working directory. Run BasicCisElementAnalyzer.R.
23+
24+
sample command: arabMotifSearch(targetfile = "Rdatafiles/upregulatedwusgenes.csv", resultsOutput="Rdatafiles/MotifOutputUP.txt", resultsBed="Rdatafiles/MotifOutputBEDUP.bed", resultsSorted="Rdatafiles/MotifOutputSortedUP.txt")
25+
26+
targetfile: list of genes to examine
27+
resultsOutput: desired name of main output file
28+
resultsBed: desired name of output file in BED format.
29+
resultsSorted: desired name sorted file (currently nonfunctional)
30+
31+
chainstart: start of cluster
32+
chainend: end of cluster
33+
phasescore: phasing score (how well cores adhere to a 10.5x bp spacing
34+
phasePerBase: phasing score divided by cluster bp
35+
corNum: number of cis elements
36+
complexCores: number of complex cis elements (multiple cis elements 4bp or less apart)
37+
complexCoreScore: length of complex cis elements summed across a cluster.
38+
39+
40+
Loading BED files
41+
BED files can be loaded through a genome browser such as IGV from the Broad Institute. Simply download and run the browser. Load the TAIR10 Arabidopsis genome and then load the BED file as a custom track.
42+
43+
44+
45+

0 commit comments

Comments
 (0)