00001 package org.gel.mauve.assembly;
00002
00003 import java.awt.Dimension;
00004 import java.io.File;
00005 import java.text.NumberFormat;
00006 import java.util.HashMap;
00007 import java.util.Iterator;
00008 import java.util.Map;
00009
00010 import javax.swing.JOptionPane;
00011 import javax.swing.JTable;
00012 import javax.swing.JTextArea;
00013 import javax.swing.table.DefaultTableModel;
00014
00015 import org.apache.commons.cli.CommandLine;
00016 import org.apache.commons.cli.HelpFormatter;
00017 import org.apache.commons.cli.Option;
00018 import org.apache.commons.cli.Options;
00019 import org.biojava.bio.symbol.SymbolList;
00020 import org.gel.mauve.BaseViewerModel;
00021 import org.gel.mauve.Chromosome;
00022 import org.gel.mauve.OptionsBuilder;
00023 import org.gel.mauve.XmfaViewerModel;
00024 import org.gel.mauve.analysis.BrokenCDS;
00025 import org.gel.mauve.analysis.Gap;
00026 import org.gel.mauve.analysis.SNP;
00027 import org.gel.mauve.contigs.ContigOrderer;
00028 import org.gel.mauve.gui.AlignFrame;
00029 import org.gel.mauve.gui.AlignWorker;
00030 import org.gel.mauve.gui.AnalysisDisplayWindow;
00031 import org.gel.mauve.gui.MauveFrame;
00032
00033 public class ScoreAssembly {
00034
00035 private static final String[] DEFAULT_ARGS = {"--skip-refinement",
00036 "--weight=200"};
00037 private static String SUM_CMD = "Summary";
00038 private static String SUM_DESC = "Summary of scoring assembly";
00039 private static String SNP_CMD = "SNPs";
00040 private static String SNP_DESC = "SNPs between reference and assembly";
00041 private static String GAP_CMD = "Gaps";
00042 private static String GAP_DESC = "Gaps in reference and assembly";
00043 private static String CDS_CMD = "Broken CDS";
00044 private static String CDS_DESC = "Broken CDS in assembly genome";
00045 private static String INV_CMD = "Inverted Contigs";
00046 private static String INV_DESC = "Incorrectly inverted contigs";
00047 private static String MIS_CMD = "Mis-assembled Contigs";
00048 private static String MIS_DESC = "Mis-assembled contigs with the total number of mis-assemblies in each";
00049
00050 private static HashMap<String,ScoreAssembly> modelMap = new HashMap<String, ScoreAssembly>();
00051
00052 private static HashMap<String,ScoreAssembly> running = new HashMap<String, ScoreAssembly>();
00053
00054 private static final String temp = "Running...";
00055
00056 private static final String error = "Error computing DCJ distances! Please report bug to atritt@ucdavis.edu";
00057
00058 private JTextArea sumTA;
00059
00060 private XmfaViewerModel model;
00061
00062 private int fWIDTH = 800;
00063
00064 private int fHEIGHT = 510;
00065
00066 private AssemblyScorer asmScore;
00067
00068 private AnalysisDisplayWindow win;
00069
00070 private boolean finished;
00071
00072 private boolean cancel;
00073
00074 public ScoreAssembly(XmfaViewerModel model){
00075
00076 this.model = model;
00077 this.finished = false;
00078 initWithJTables(model);
00079 }
00080
00081 private void initWithJTables(XmfaViewerModel model){
00082 win = new AnalysisDisplayWindow("Score Assembly - "+model.getSrc().getName(), fWIDTH, fHEIGHT);
00083 sumTA = win.addContentPanel(SUM_CMD, SUM_DESC, true);
00084 sumTA.append(temp);
00085 this.win.showWindow();
00086 running.put(this.model.getSrc().getAbsolutePath(), this);
00087 new Thread( new Runnable (){
00088 public void run(){
00089 try {
00090 asmScore = new AssemblyScorer(ScoreAssembly.this.model);
00091 ScoreAssembly.this.finish();
00092 } catch (Exception e){
00093 ScoreAssembly.this.cancel();
00094 System.err.println("\nError\n");
00095 e.printStackTrace();
00096 }
00097 }
00098 }).start();
00099 }
00100
00101 public void cancel(){
00102 cancel = true;
00103 }
00104
00105 private synchronized void finish(){
00106 if (!cancel) {
00107 running.remove(this.model.getSrc().getAbsolutePath());
00108 modelMap.put(this.model.getSrc().getAbsolutePath(), this);
00109 sumTA.replaceRange("", 0, temp.length());
00110 sumTA.setText(getSumText(asmScore,true,false));
00111
00112 addTables();
00113 finished = true;
00114 win.showWindow();
00115 sumTA.setCaretPosition(0);
00116 }
00117 }
00118
00119 private void addTables(){
00120 SNP[] snps = asmScore.getSNPs();
00121 Object[] snpHeader = {"SNP_Pattern","Ref_Contig","Ref_PosInContig",
00122 "Ref_PosGenomeWide","Assembly_Contig",
00123 "Assembly_PosInContig","Assembly_PosGenomeWide"};
00124
00125 DefaultTableModel snpData = win.addContentTable(SNP_CMD, SNP_DESC, false);
00126 win.showWindow();
00127 snpData.setColumnIdentifiers(snpHeader);
00128 for (int snpI = 0; snpI < snps.length; snpI++){
00129 snpData.addRow(snps[snpI].toString().split("\t"));
00130 }
00131
00132 Gap[] refGaps = asmScore.getReferenceGaps();
00133 Gap[] assGaps = asmScore.getAssemblyGaps();
00134 int nGen = model.numGenomes();
00135 Object[] gapHeader = new Object[5+(nGen)*3];
00136 gapHeader[0] = "Sequence";
00137 gapHeader[1] = "Contig";
00138 gapHeader[2] = "Position_in_Contig";
00139 gapHeader[3] = "GenomeWide_Position";
00140 gapHeader[4] = "Length";
00141 int idx = 5;
00142 for (int j = 0; j < nGen; j++){
00143 gapHeader[idx++] = "sequence_"+j+"_pos";
00144 gapHeader[idx++] = "sequence_"+j+"_ctg";
00145 gapHeader[idx++] = "sequence_"+j+"_posInCtg";
00146 }
00147 DefaultTableModel gapData = win.addContentTable(GAP_CMD, GAP_DESC, false);
00148 gapData.setColumnIdentifiers(gapHeader);
00149 for (int gapI = 0; gapI < refGaps.length; gapI++)
00150 gapData.addRow(refGaps[gapI].toString().split("\t"));
00151 for (int gapI = 0; gapI < assGaps.length; gapI++)
00152 gapData.addRow(assGaps[gapI].toString().split("\t"));
00153 if (asmScore.hasBrokenCDS()){
00154 Object[] cdsHeader =
00155 {"CDS_ID","Peptide_Length",
00156 "Perc_IncorrectBases", "Broken_Frame_Segments",
00157 "Gap_Segments","Substituted_Positions","Substitutions",
00158 "Stop_Codon_Positions","Original_Residue"};
00159 DefaultTableModel cdsData = win.addContentTable(CDS_CMD, CDS_DESC, false);
00160 cdsData.setColumnIdentifiers(cdsHeader);
00161 BrokenCDS[] cds = asmScore.getBrokenCDS();
00162 for (BrokenCDS bcds: cds)
00163 cdsData.addRow(bcds.toString().split("\t"));
00164 }
00165 }
00166
00167 public static String getSumText(AssemblyScorer assScore, boolean header, boolean singleLine){
00168 NumberFormat nf = NumberFormat.getInstance();
00169 nf.setMaximumFractionDigits(4);
00170 StringBuilder sb = new StringBuilder();
00171 if (singleLine){
00172 if (header) {
00173 sb.append("Name\tNumContigs\tNumRefReplicons\tNumAssemblyBases\tNumReferenceBases\tNumLCBs\t" +
00174 "DCJ_Distance\tNumDCJBlocks\tNumSNPs\tNumMisCalled\tNumUnCalled\tNumGapsRef\tNumGapsAssembly\t" +
00175 "TotalBasesMissed\tPercBasesMissed\tExtraBases\tPercExtraBases" +
00176 "\tMissingChromosomes\tExtraContigs\tNumSharedBoundaries\tNumInterLcbBoundaries"+
00177 "\tBrokenCDS\tIntactCDS\tContigN50\tContigN90\tMinContigLength\tMaxContigLength"+
00178 "\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT\n");
00179 }
00180 sb.append( assScore.getModel().getGenomeBySourceIndex(1).getDisplayName() + "\t" +
00181 assScore.numContigs()+"\t"+assScore.numReplicons()+"\t"+
00182 assScore.numBasesAssembly()+"\t"+assScore.numBasesReference()+"\t"+assScore.numLCBs()+"\t"+
00183 assScore.getDCJdist()+"\t"+assScore.numBlocks()+"\t"+assScore.getSNPs().length+"\t"+
00184 assScore.getMiscalled()+'\t'+assScore.getUncalled()+'\t'+
00185 assScore.getReferenceGaps().length+"\t"+assScore.getAssemblyGaps().length+"\t"+
00186 assScore.totalMissedBases()+"\t"+nf.format(assScore.percentMissedBases()*100)+"\t"+
00187 assScore.totalExtraBases()+"\t"+nf.format(assScore.percentExtraBases()*100) +"\t"+
00188 assScore.getMissingChromosomes().length+"\t"+assScore.getExtraContigs().length +"\t"+
00189 assScore.getSharedBoundaryCount()+"\t"+assScore.getInterLcbBoundaryCount());
00190 sb.append( "\t" + assScore.numBrokenCDS()+"\t"+assScore.numCompleteCDS());
00191 sb.append( "\t" + assScore.getContigN50() + "\t" + assScore.getContigN90() + "\t" + assScore.getMinContigLength() + "\t" + assScore.getMaxContigLength());
00192 int[][] subs = assScore.getSubs();
00193 for(int i=0; i<subs.length; i++)
00194 {
00195 for(int j=0; j<subs[i].length; j++)
00196 {
00197 sb.append('\t');
00198 sb.append(subs[i][j]);
00199 }
00200 }
00201 sb.append("\n");
00202 } else {
00203 if (header) {
00204 sb.append(
00205 "Number of Contigs:\t"+assScore.numContigs()+"\n"+
00206 "Number reference replicons:\t" + assScore.numReplicons()+"\n"+
00207 "Number of assembly bases:\t" + assScore.numBasesAssembly()+"\n"+
00208 "Number of reference bases:\t" + assScore.numBasesReference()+"\n"+
00209 "Number of LCBs:\t" + assScore.numLCBs()+"\n"+
00210 "Number of Blocks:\t"+assScore.numBlocks()+"\n"+
00211 "Breakpoint Distance:\t"+assScore.getBPdist()+"\n"+
00212 "DCJ Distance:\t"+assScore.getDCJdist()+"\n"+
00213 "SCJ Distance:\t"+assScore.getSCJdist()+"\n"+
00214 "Number of Complete Coding Sequences:\t"+assScore.numCompleteCDS()+"\n"+
00215 "Number of Broken Coding Sequences:\t"+assScore.numBrokenCDS()+"\n"+
00216 "Number of SNPs:\t"+assScore.getSNPs().length+"\n"+
00217 "Number of Gaps in Reference:\t"+assScore.getReferenceGaps().length+"\n"+
00218 "Number of Gaps in Assembly:\t"+assScore.getAssemblyGaps().length+"\n" +
00219 "Total bases missed in reference:\t" + assScore.totalMissedBases() +"\n"+
00220 "Percent bases missed:\t" + nf.format(assScore.percentMissedBases()*100)+" %\n"+
00221 "Total bases extra in assembly:\t" + assScore.totalExtraBases()+"\n" +
00222 "Percent bases extra:\t" + nf.format(assScore.percentExtraBases()*100)+ " %\n"+
00223 "Number of missing chromosomes:\t" + assScore.getMissingChromosomes().length +"\n"+
00224 "Number of extra contigs:\t"+assScore.getExtraContigs().length +"\n"+
00225 "Number of Shared Boundaries:\t"+assScore.getSharedBoundaryCount()+"\n"+
00226 "Number of Inter-LCB Boundaries:\t"+assScore.getInterLcbBoundaryCount()+"\n"+
00227 "Contig N50:\t"+assScore.getContigN50()+"\n"+
00228 "Contig N90:\t"+assScore.getContigN90()+"\n"+
00229 "Min contig length:\t"+assScore.getMinContigLength()+"\n"+
00230 "Max contig length:\t"+assScore.getMaxContigLength()+"\n"+
00231 "Substitutions (Ref on Y, Assembly on X):\n"+AssemblyScorer.subsToString(assScore)
00232 );
00233
00234
00235 } else {
00236 sb.append(
00237 assScore.numContigs()+"\n"+
00238 assScore.numLCBs()+"\n"+
00239 assScore.getDCJdist()+"\n"+
00240 assScore.numBlocks()+"\n"+
00241 assScore.getSNPs().length+"\n"+
00242 assScore.getReferenceGaps().length+"\n"+
00243 assScore.getAssemblyGaps().length+"\n" +
00244 assScore.totalMissedBases() +"\n"+
00245 nf.format(assScore.percentMissedBases()*100)+"\n"+
00246 assScore.totalExtraBases()+"\n" +
00247 nf.format(assScore.percentExtraBases()*100)+ "\n"+
00248 assScore.getMissingChromosomes() +"\n"+
00249 assScore.getExtraContigs()+"\n"+
00250 "Number of Shared Boundaries:\t"+assScore.getSharedBoundaryCount()+"\n"+
00251 "Number of Inter-LCB Boundaries:\t"+assScore.getInterLcbBoundaryCount()+"\n"+
00252 AssemblyScorer.subsToString(assScore));
00253 }
00254 }
00255 return sb.toString();
00256 }
00257
00258 public static void launchWindow(MauveFrame frame){
00259 BaseViewerModel model = frame.getModel();
00260 String key = model.getSrc().getAbsolutePath();
00261 if (modelMap.containsKey(key)){
00262
00263 startNewSA(frame);
00264 } else if (model instanceof XmfaViewerModel) {
00265 startNewSA(frame);
00266 } else {
00267 System.err.println("Can't score assembly unless I have an" +
00268 " XmfaViewerModel -- Please report this bug!");
00269 }
00270 }
00271
00272 private static void startNewSA(MauveFrame frame){
00273 interactive(frame);
00274 }
00275
00276 public static void interactive(MauveFrame frame){
00277 XmfaViewerModel model = (XmfaViewerModel) frame.getModel();
00278 String key = model.getSrc().getAbsolutePath();
00279 if (running.containsKey(key)) {
00280 int ret_val = JOptionPane.showConfirmDialog(frame, "I'm currently"+
00281 "running the scoring pipeline for this assembly. Do you " +
00282 "want me to restart it?");
00283 if (ret_val == JOptionPane.YES_OPTION){
00284 running.get(key).cancel();
00285 running.remove(key).win.closeWindow();
00286 startNewSA(frame);
00287 } else if (ret_val == JOptionPane.NO_OPTION){
00288 running.get(key).win.showWindow();
00289 }
00290 } else {
00291 running.put(key,
00292 new ScoreAssembly(model));
00293 }
00294 }
00295
00296 public static void main(String[] args){
00297 CommandLine line = OptionsBuilder.getCmdLine(getOptions(), args);
00298 if (args.length == 0 || line == null || line.hasOption("help")){
00299 HelpFormatter formatter = new HelpFormatter();
00300 formatter.printHelp(80,
00301 "java -cp Mauve.jar org.gel.mauve.assembly.ScoreAssembly [options]",
00302 "[options]", getOptions(), "report bugs to Aaron Darling <aarondarling@ucdavis.edu>");
00303 System.exit(-1);
00304 }
00305
00306 File outDir = null;
00307 if (line.hasOption("outputDir"))
00308 outDir = new File(line.getOptionValue("outputDir"));
00309 else
00310 outDir = new File(System.getProperty("user.dir"));
00311
00312 boolean batch = false;
00313 if (line.hasOption("batch"))
00314 batch = true;
00315 String basename = null;
00316 if (line.hasOption("basename"))
00317 basename = line.getOptionValue("basename");
00318
00319 File alnmtFile = null;
00320 AssemblyScorer as = null;
00321
00322 if (line.hasOption("alignment")){
00323 if ((line.hasOption("reference") || line.hasOption("assembly"))){
00324 System.err.println("You gave me an alignment along with a reference and/or assembly genome.\n" +
00325 "Do not use use the \"-reference\" and \"-assembly\" flags with the \"-alignment\" flag.");
00326 System.exit(-1);
00327 }
00328 alnmtFile = new File (line.getOptionValue("alignment"));
00329 if (basename == null) {
00330 basename = alnmtFile.getName();
00331
00332 }
00333 as = getAS(alnmtFile);
00334 AssemblyScorer.printInfo(as,outDir,basename,batch);
00335 } else {
00336
00337 if (!line.hasOption("reference")){
00338 System.err.println("Reference file not given.");
00339 System.exit(-1);
00340 } else if (!line.hasOption("assembly")){
00341 System.err.println("Assembly file not given.");
00342 System.exit(-1);
00343 }
00344
00345 File refFile = new File(line.getOptionValue("reference"));
00346 File assPath = new File(line.getOptionValue("assembly"));
00347
00348 if (line.hasOption("reorder")){
00349 String[] reorderParams = new String[8];
00350 reorderParams[0] = "-output";
00351 reorderParams[1] = outDir.getAbsolutePath();
00352 reorderParams[2] = "-ref";
00353 reorderParams[3] = refFile.getAbsolutePath();
00354 reorderParams[4] = "-draft";
00355 reorderParams[5] = assPath.getAbsolutePath();
00356
00357
00358
00359 reorderParams[6] = "--seed-weight=25";
00360 reorderParams[7] = "--solid-seeds";
00361 ContigOrderer co = new ContigOrderer(reorderParams, null, false);
00362 if (basename != null)
00363 as = new AssemblyScorer(co, outDir, basename);
00364 else
00365 as = new AssemblyScorer(co, outDir);
00366 co.addAlignmentProcessListener(as);
00367 } else {
00368 if (basename == null) {
00369 basename = assPath.getName();
00370 }
00371 alnmtFile = new File(outDir, basename+".xmfa");
00372 as = new AssemblyScorer(alnmtFile, outDir, basename);
00373 String[] cmd = makeAlnCmd(refFile,assPath, alnmtFile);
00374 System.out.println("Executing");
00375 AlignFrame.printCommand(cmd, System.out);
00376 AlignWorker worker = new AlignWorker(as, cmd);
00377 worker.start();
00378 }
00379
00380 }
00381 }
00382
00383 private static AssemblyScorer getAS(File alnmtFile){
00384 try {
00385
00386 XmfaViewerModel model = new XmfaViewerModel(alnmtFile,null);
00387 return new AssemblyScorer(model);
00388 } catch (Exception e){
00389 e.printStackTrace();
00390 System.exit(-1);
00391 }
00392 return null;
00393 }
00394
00395 private static String[] makeAlnCmd(File seq1, File seq2, File output){
00396 String[] ret = new String[6 + DEFAULT_ARGS.length];
00397 int j = 0;
00398 ret[j++] = AlignFrame.getBinaryPath("progressiveMauve");
00399 for (int i = 0; i < DEFAULT_ARGS.length; i++)
00400 ret[j++] = DEFAULT_ARGS[i];
00401 ret[j++] = "--output="+output.getAbsolutePath();
00402 ret[j++] = "--backbone-output=" + output.getAbsolutePath()+".backbone";
00403 ret[j++] = "--output-guide-tree=" + output.getAbsolutePath()+".guide_tree";
00404 ret[j++] = seq1.getAbsolutePath();
00405 ret[j++] = seq2.getAbsolutePath();
00406 return ret;
00407 }
00408
00409 @SuppressWarnings("static-access")
00410 private static Options getOptions(){
00411 Options ret = new Options();
00412 OptionsBuilder ob = new OptionsBuilder();
00413 ob.addBoolean("help", "print this message");
00414 ob.addBoolean("batch", "run in batch mode i.e. print summary output " +
00415 "on one line to standard output");
00416 ob.addBoolean("reorder", "reorder contigs before scoring the assembly");
00417 ob.addArgument("string", "basename for output files", "basename",false);
00418 ob.addArgument("directory", "save output in <directory>. Default " +
00419 "is current directory.", "outputDir",true);
00420 ob.addArgument("file", "file containing alignment of assembly to " +
00421 "reference genome", "alignment",false);
00422 ob.addArgument("file", "file containing reference genome", "reference",false);
00423 ob.addArgument("file", "file containing assembly/draft genome to score",
00424 "assembly",false);
00425 return ob.getOptions();
00426 }
00427 }