src/org/gel/mauve/XMFAAlignment.java

Go to the documentation of this file.
00001 package org.gel.mauve;
00002 
00003 import java.io.IOException;
00004 import java.io.RandomAccessFile;
00005 import java.io.Serializable;
00006 import java.util.Properties;
00007 import java.util.Vector;
00008 
00009 import org.gel.mauve.analysis.SnpExporter;
00010 import org.gel.mauve.tree.FileKey;
00011 import org.gel.mauve.tree.GISTree;
00012 import org.gel.mauve.tree.GapKey;
00013 import org.gel.mauve.tree.TreeStore;
00014 
00018 public class XMFAAlignment implements Serializable {
00020         static final long serialVersionUID = 5;
00021 
00022         // The file containing alignment data (transient, can't be serialized)
00023         protected transient RandomAccessFile xmfa_file;
00024 
00025         // The match class instances store the boundaries of each aligned segment
00026         protected Match [] intervals;
00027 
00028         // These store the offset within each file where an interval's sequence
00029         protected Match [] file_pos;
00030 
00031         // The number of newline bytes, as detected on the first line of the file
00032         protected int newline_size = 0;
00033 
00034         // The number of chars on each line
00035         protected int line_width = 80;
00036 
00037         // The number of sequences aligned
00038         protected int seq_count = 0;
00039 
00040         // The length of each sequence
00041         protected long [] seq_length;
00042 
00043         // A list of LCBs contained in this alignment
00044         protected LCB [] lcb_list;
00045 
00046         // A list of LCBs as they appear in the file
00047         private LCB [] source_lcb_list;
00048 
00049         // Array of comment lines for each alignment entry in the XMFA
00050         protected String [] comments;
00051 
00052         // Array of names for each sequence in the first alignment entry
00053         protected String [] names;
00054 
00055         // A set of gist's that map sequence index and column index to file
00056         // offset. indexed by [interval][sequence]
00057         GISTree [][] gis_tree;
00058 
00059         protected TreeStore ts = new TreeStore ();
00060 
00061         // The size of data chunks to read from disk
00062         protected int buffer_size = 500000;
00063 
00064         public Properties metadata = new Properties ();
00065 
00073         public void setFile (RandomAccessFile f) {
00074                 xmfa_file = f;
00075         }
00076 
00081         @SuppressWarnings(/*"deprecation"*/ "unchecked")
00082         public XMFAAlignment (RandomAccessFile ir) throws java.io.IOException {
00083                 xmfa_file = ir;
00084                 // simple parse of the input
00085                 Vector seq_nums = new Vector ();
00086                 Vector lend = new Vector ();
00087                 Vector rend = new Vector ();
00088                 Vector reverse = new Vector ();
00089                 Vector f_offset = new Vector ();
00090                 Vector f_end_offset = new Vector ();
00091                 Vector tmp_ivs = new Vector ();
00092                 Vector tmp_offs = new Vector ();
00093                 Vector tmp_lcbs = new Vector ();
00094                 Vector gis_tree_tmp = new Vector ();
00095                 Vector gist_seqnums = new Vector ();
00096                 Vector gist_ivnums = new Vector ();
00097                 Vector tmp_comments = new Vector ();
00098                 Vector tmp_names = new Vector ();
00099                 String cur_comment = null;
00100 
00102                 long seq_offset = 0;
00103                 GISTree gist = null;
00104 
00105                 FileKey fk = null;
00106                 GapKey gk = null;
00107 
00108                 // XMFA file parse states
00109                 final int wait_defline = 0;
00110                 final int read_defline = 1;
00111                 final int read_comment = 2;
00112                 final int read_sequence = 3;
00113                 final int process_sequence = 4;
00114                 final int read_metadata = 5;
00115 
00116                 byte [] buf = new byte [buffer_size];
00117                 int bufI = 0;
00118                 int remaining_bytes = 0;
00119                 long cur_base = 0;
00120                 int state = wait_defline;
00121                 int section_start = buffer_size;
00122 
00123                 // detect the number of bytes in a newline
00124                 xmfa_file.seek (0);
00125                 String first_line = xmfa_file.readLine ();
00126                 newline_size = (int) (xmfa_file.getFilePointer () - first_line
00127                                 .length ());
00128                 xmfa_file.seek (0);
00129 
00130                 // begin parse loop
00131                 while (true) {
00132                         if (remaining_bytes == 0) {
00133                                 // it's important to preserve the defline read thus far
00134                                 // shift it to the beginning of the array and then read
00135                                 // from that point onward
00136                                 if (state != read_defline && state != read_comment)
00137                                         section_start = buf.length;
00138                                 int copy_len = buf.length - section_start;
00139                                 System.arraycopy (buf, section_start, buf, 0, copy_len);
00140 
00141                                 if (state == read_defline || state == read_comment)
00142                                         section_start = 0;
00143 
00144                                 // read more data into the buffer
00145                                 cur_base += bufI;
00146                                 xmfa_file.seek (cur_base);
00147                                 remaining_bytes = xmfa_file.read (buf, copy_len, buf.length
00148                                                 - copy_len);
00149                                 cur_base -= copy_len;
00150                                 if (remaining_bytes <= 0)
00151                                         break;
00152                                 bufI = copy_len; // continue wherever we left off
00153                         }
00154 
00155                         switch (state) {
00156                                 case wait_defline:
00157                                         if (buf[bufI] == '#') {
00158                                                 // read metadata or a user comment
00159                                                 section_start = bufI + 1;
00160                                                 state = read_metadata;
00161                                         }
00162                                         if (buf[bufI] == '>') {
00163                                                 section_start = bufI;
00164                                                 state = read_defline;
00165                                                 bufI--;
00166                                                 remaining_bytes++;
00167                                         }
00168                                         break;
00169                                 case read_metadata:
00170                                         if (buf[bufI] == '\r' || buf[bufI] == '\n') {
00171                                                 String cur_line = new String (buf, section_start, bufI
00172                                                                 - section_start);
00173 
00174                                                 // is this a sequence count specifier?
00175                                                 int sc_index = cur_line.indexOf ("SequenceCount");
00176                                                 if (sc_index >= 0) {
00177                                                         seq_count = Integer.parseInt (cur_line.substring (
00178                                                                         sc_index + 13).trim ());
00179                                                 } else
00180                                                 // Otherwise, add it to the properties collection.
00181                                                 {
00182                                                         String [] parts = cur_line.split ("\\s", 2);
00183 
00184                                                         if (parts.length == 0) {
00185                                                                 // Do nothing
00186                                                         } else if (parts.length == 1) {
00187                                                                 metadata.setProperty (parts[0].trim (), "");
00188                                                         } else {
00189                                                                 metadata.setProperty (parts[0].trim (),
00190                                                                                 parts[1].trim ());
00191                                                         }
00192                                                 }
00193                                                 // go back to waiting for a defline
00194                                                 state = wait_defline;
00195                                         }
00196                                         break;
00197                                 // when a newline rolls around, parse out the defline
00198                                 case read_defline:
00199                                         if (buf[bufI] == '\r' || buf[bufI] == '\n') {
00200                                                 // read the goods into cur_line
00201                                                 String cur_line = new String (buf, section_start, bufI
00202                                                                 - section_start);
00203 
00204                                                 // parse a sequence left-end, right-end, and strand
00205                                                 // combination
00206                                                 // > number:start-end strand(+/-)
00207                                                 int colon_pos = cur_line.indexOf (':');
00208                                                 int seq_num = -1;
00209                                                 java.util.StringTokenizer seq_num_strtok = new java.util.StringTokenizer (
00210                                                                 cur_line.substring (1, colon_pos));
00211                                                 if (seq_num_strtok.hasMoreTokens ()) {
00212                                                         seq_num = Integer.parseInt (seq_num_strtok
00213                                                                         .nextToken ()) - 1;
00214                                                         seq_nums.addElement (new Integer (seq_num));
00215                                                 }
00216                                                 String tmp_str = cur_line.substring (colon_pos + 1);
00217                                                 int dash_pos = tmp_str.indexOf ('-');
00218                                                 int space_pos = tmp_str.indexOf (' ');
00219                                                 long first = Long.parseLong (tmp_str.substring (0,
00220                                                                 dash_pos));
00221                                                 long second = Long.parseLong (tmp_str.substring (
00222                                                                 dash_pos + 1, space_pos));
00223                                                 long left = first < second ? first : second;
00224                                                 long right = first < second ? second : first;
00225                                                 boolean cur_reverse = true;
00226                                                 // parse strand -- if + isn't found then assume - strand
00227                                                 if (tmp_str.indexOf ('+', space_pos + 1) < 0)
00228                                                         reverse.addElement (new Boolean (true));
00229                                                 else {
00230                                                         reverse.addElement (new Boolean (false));
00231                                                         cur_reverse = false;
00232                                                 }
00233                                                 lend.addElement (new Long (left));
00234                                                 rend.addElement (new Long (right));
00235 
00236                                                 // if we haven't yet completed the first alignment entry
00237                                                 // try to parse the sequence name
00238                                                 if (tmp_comments.size () == 0) {
00239                                                         int strand_pos = 0;
00240                                                         if (!cur_reverse)
00241                                                                 strand_pos = tmp_str.indexOf ('+',
00242                                                                                 space_pos + 1);
00243                                                         else
00244                                                                 strand_pos = tmp_str.indexOf ('-',
00245                                                                                 space_pos + 1);
00246 
00247                                                         if (tmp_str.length () > strand_pos + 2)
00248                                                                 // there's a name to parse
00249                                                                 tmp_names.add (tmp_str
00250                                                                                 .substring (strand_pos + 2));
00251                                                         else
00252                                                                 tmp_names.add (new String (""));
00253                                                 }
00254 
00255                                                 // prepare for seq parsing
00256                                                 seq_offset = 0;
00257                                                 state = read_sequence;
00258 
00259                                                 gist = new GISTree (ts);
00260                                                 gis_tree_tmp.addElement (gist);
00261                                                 gist_seqnums.addElement (new Integer (seq_num));
00262                                                 gist_ivnums.addElement (new Integer (tmp_ivs.size ()));
00263                                         }
00264                                         break;
00265 
00266                                 // ride out the comment
00267                                 case read_comment:
00268                                         if (buf[bufI] == '\r' || buf[bufI] == '\n') {
00269                                                 // store the comment
00270                                                 if (cur_comment == null) {
00271                                                         cur_comment = new String (buf, section_start, bufI
00272                                                                         - section_start);
00273                                                         tmp_comments.add (cur_comment);
00274                                                 }
00275                                                 if (buf[bufI] == '\n') {
00276                                                         state = wait_defline;
00277                                                         cur_comment = null;
00278                                                 }
00279                                         }
00280                                         break;
00281 
00282                                 // wait for a sequence entry to finish, add gist keys, etc.
00283                                 case read_sequence:
00284                                         if (buf[bufI] == '>' || buf[bufI] == '=') {
00285                                                 state = process_sequence;
00286                                                 remaining_bytes++;
00287                                                 bufI--;
00288                                                 break;
00289                                         }
00290 
00291                                         // do nothing on newlines
00292                                         if (buf[bufI] == '\r' || buf[bufI] == '\n') {
00293                                                 break;
00294                                         }
00295 
00296                                         // set the sequence data file offset if this is the
00297                                         // beginning of
00298                                         // the entry
00299                                         if (fk == null && gk == null) {
00300                                                 f_offset.addElement (new Long (cur_base + bufI));
00301                                         }
00302 
00303                                         // it's all sequence data
00304                                         long cur_len = gist.length ();
00305                                         if (buf[bufI] == '-') {
00306                                                 if (fk != null) {
00307                                                         gist.insert (fk, GISTree.end);
00308                                                         if (cur_len + fk.getLength () != gist.length ()) {
00309                                                                 throw new RuntimeException ("Corrupt GisTree.");
00310                                                         }
00311                                                         fk = null;
00312                                                 }
00313                                                 if (gk == null)
00314                                                         gk = new GapKey (0);
00315                                                 gk.length++;
00316                                         } else {
00317                                                 if (gk != null) {
00318                                                         gist.insert (gk, GISTree.end);
00319                                                         if (cur_len + gk.getLength () != gist.length ()) {
00320                                                                 throw new RuntimeException ("Corrupt GisTree");
00321                                                         }
00322                                                         gk = null;
00323                                                 }
00324                                                 if (fk == null) {
00325                                                         fk = new FileKey (seq_offset, cur_base + bufI);
00326                                                 }
00327                                                 fk.incrementLength ();
00328                                                 fk.setFLength (cur_base + bufI - fk.getOffset () + 1);
00329                                         }
00330 
00331                                         break;
00332 
00333                                 // process a completed sequence entry
00334                                 case process_sequence:
00335 
00336                                         // do final processing from a previously parsed sequence
00337                                         long curry_len = gist.length ();
00338                                         if (fk != null) {
00339                                                 gist.insert (fk, GISTree.end);
00340                                                 if (curry_len + fk.getLength () != gist.length ()) {
00341                                                         throw new RuntimeException ("Corrupt GisTree");
00342                                                 }
00343                                         }
00344                                         if (gk != null) {
00345                                                 gist.insert (gk, GISTree.end);
00346                                                 if (curry_len + gk.getLength () != gist.length ()) {
00347                                                         throw new RuntimeException ("Corrupt GisTree");
00348                                                 }
00349                                         }
00350                                         fk = null;
00351                                         gk = null;
00352 
00353                                         // sequence ends here
00354                                         f_end_offset.addElement (new Long (cur_base + bufI));
00355 
00356                                         if (buf[bufI] == '>') {
00357                                                 section_start = bufI;
00358                                                 state = read_defline;
00359                                                 remaining_bytes++;
00360                                                 bufI--;
00361                                         } else if (buf[bufI] == '=') {
00362                                                 // process a sequence set!
00363 
00364                                                 if (seq_count == 0)
00365                                                         seq_count = lend.size ();
00366                                                 // create a new Match element
00367                                                 Match m = new Match (seq_count);
00368                                                 // copy file offsets
00369                                                 Match f_off_m = new Match (seq_count);
00370                                                 // copy left ends into start
00371                                                 int aligned_count = 0;
00372                                                 // only set values for sequence numbers that were
00373                                                 // actually
00374                                                 // part
00375                                                 // of this interval
00376                                                 for (int seq_numI = 0; seq_numI < seq_nums.size (); seq_numI++) {
00377                                                         int seqI = ((Integer) seq_nums.elementAt (seq_numI))
00378                                                                         .intValue ();
00379                                                         m.setStart (seqI,
00380                                                                         ((Long) lend.elementAt (seq_numI))
00381                                                                                         .longValue ());
00382                                                         if (m.getStart (seqI) != 0)
00383                                                                 aligned_count++;
00384                                                         m.setLength (seqI, ((Long) rend
00385                                                                         .elementAt (seq_numI)).longValue ());
00386                                                         m.setReverse (seqI, ((Boolean) reverse
00387                                                                         .elementAt (seq_numI)).booleanValue ());
00388                                                         f_off_m.setStart (seqI, ((Long) f_offset
00389                                                                         .elementAt (seq_numI)).longValue ());
00390                                                         f_off_m.setLength (seqI, ((Long) f_end_offset
00391                                                                         .elementAt (seq_numI)).longValue ());
00392                                                 }
00393                                                 // if this interval contains alignment of more than one
00394                                                 // sequence then call it an LCB
00395                                                 // there may be problems with the assumption that an
00396                                                 // interval will always contain
00397                                                 // some aligned sequence if it has more than one set of
00398                                                 // genome coordinates defined,
00399                                                 // but let's not worry about that now since mauveAligner
00400                                                 // won't have that problem
00401                                                 if (aligned_count > 1) {
00402                                                         LCB lcb = new LCB (m, tmp_lcbs.size (), seq_count);
00403                                                         tmp_lcbs.addElement (lcb);
00404                                                 }
00405 
00406                                                 // add to tmp_ivs
00407                                                 tmp_ivs.addElement (m);
00408                                                 tmp_offs.addElement (f_off_m);
00409 
00410                                                 // clear data structures for the next alignment interval
00411                                                 lend = new Vector ();
00412                                                 rend = new Vector ();
00413                                                 reverse = new Vector ();
00414                                                 f_offset = new Vector ();
00415                                                 f_end_offset = new Vector ();
00416                                                 seq_nums = new Vector ();
00417 
00418                                                 state = read_comment;
00419                                                 section_start = bufI;
00420                                                 remaining_bytes++;
00421                                                 bufI--;
00422                                         }
00423                                         break;
00424 
00425                         }
00426 
00427                         bufI++;
00428                         remaining_bytes--;
00429                 }
00430 
00431                 intervals = new Match [tmp_ivs.size ()];
00432                 intervals = (Match []) tmp_ivs.toArray (intervals);
00433                 file_pos = new Match [tmp_offs.size ()];
00434                 file_pos = (Match []) tmp_offs.toArray (file_pos);
00435                 lcb_list = new LCB [tmp_lcbs.size ()];
00436                 lcb_list = (LCB []) tmp_lcbs.toArray (lcb_list);
00437                 setSourceLcbList(new LCB [lcb_list.length]);
00438                 for(int ll = 0; ll < lcb_list.length; ll++)
00439                         getSourceLcbList()[ll] = new LCB(lcb_list[ll]);
00440                 seq_length = new long [seq_count];
00441                 gis_tree = new GISTree [intervals.length] [seq_count];
00442                 for (int gistI = 0; gistI < gis_tree_tmp.size (); gistI++) {
00443                         int ivI = ((Integer) gist_ivnums.elementAt (gistI)).intValue ();
00444                         int seqI = ((Integer) gist_seqnums.elementAt (gistI)).intValue ();
00445                         gis_tree[ivI][seqI] = (GISTree) gis_tree_tmp.elementAt (gistI);
00446                 }
00447 
00448                 for (int seqI = 0; seqI < seq_count; seqI++) {
00449                         long seq_len = 0;
00450                         int treeI = 0;
00451                         for (int ivI = 0; ivI < intervals.length; ivI++) {
00452                                 if (intervals[ivI].getLength (seqI) > seq_len) {
00453                                         seq_len = intervals[ivI].getLength (seqI);
00454                                 }
00455                                 if (gis_tree[ivI][seqI] == null) {
00456                                         gis_tree[ivI][seqI] = new GISTree (ts);
00457                                         // insert a GapKey of the full length of this interval
00458                                         long iv_length = 0;
00459                                         for (int seqJ = 0; seqJ < seq_count; seqJ++) {
00460                                                 if (gis_tree[ivI][seqJ] != null
00461                                                                 && iv_length < gis_tree[ivI][seqJ].length ()) {
00462                                                         iv_length = gis_tree[ivI][seqJ].length ();
00463                                                 }
00464                                         }
00465                                         gis_tree[ivI][seqI].insert (new GapKey (iv_length), 0);
00466                                 }
00467                                 treeI++;
00468                         }
00469                         seq_length[seqI] = seq_len;
00470                 }
00471 
00472                 names = new String [tmp_names.size ()];
00473                 names = (String []) tmp_names.toArray (names);
00474 
00475                 comments = new String [tmp_comments.size ()];
00476                 comments = (String []) tmp_comments.toArray (comments);
00477 
00478                 // Prune the arrays.
00479                 ts.pruneArrays ();
00480         }
00481 
00490         public String getComment (int lcbI) {
00491                 return comments[lcbI];
00492         }
00493 
00501         public String getName (int seqI) {
00502                 return names[seqI];
00503         }
00504 
00521         public byte[][] getRange (Genome g, long lend, long rend) {
00522                 long cur_offset = lend;
00523                 byte[][] cols = new byte [seq_count][];
00524                 int seqJ = 0;
00525                 for (seqJ = 0; seqJ < seq_count; seqJ++)
00526                         cols[seqJ] = new byte [0];
00527 
00528                 while (cur_offset <= rend) {
00529                         // determine which LCB we start in
00530                         int ivI = getLCB (g, cur_offset);
00531 
00532                         // determine the file offset of the left end of this sequence
00533                         // read a bunch o' columns
00534                         long cur_iv_lend = intervals[ivI].getStart (g);
00535                         long read_size = intervals[ivI].getLength (g) - rend > 0 ? rend
00536                                         - cur_offset + 1 : intervals[ivI].getLength (g)
00537                                         - cur_offset + 1;
00538                         while (read_size > 0) {
00539                                 // assume forward orientation
00540                                 // plan b: get the range of columns that need to be read
00541                                 // for each seq, read the cols directly into a byte buffer
00542                                 // reverse the sequences if necessary
00543                                 long lcb_offset = cur_offset - cur_iv_lend;
00544                                 long lcb_right_offset = intervals[ivI].getReverse (g) ? intervals[ivI]
00545                                                 .getLength (g)
00546                                                 - intervals[ivI].getStart (g) - lcb_offset + 1
00547                                                 : lcb_offset + read_size;
00548                                 lcb_offset = intervals[ivI].getReverse (g) ? lcb_right_offset
00549                                                 - read_size : lcb_offset;
00550 
00551                                 long fk_left_col = gis_tree[ivI][g.getSourceIndex ()]
00552                                                 .seqPosToColumn (lcb_offset);
00553                                 long fk_right_col = gis_tree[ivI][g.getSourceIndex ()]
00554                                                 .seqPosToColumn (lcb_right_offset);
00555 
00556                                 byte[][] byte_bufs = new byte [seq_count][];
00557                                 for (seqJ = 0; seqJ < seq_count; seqJ++) {
00558                                         byte_bufs[seqJ] = readRawSequence (ivI, seqJ, fk_left_col,
00559                                                         fk_right_col - fk_left_col);
00560                                 }
00561 
00562                                 // just assume that the correct number of columns was read
00563                                 cur_offset += read_size;
00564                                 read_size = 0;
00565 
00566                                 // reverse the columns if necessary
00567                                 if (intervals[ivI].getReverse (g)) {
00568                                         for (seqJ = 0; seqJ < seq_count; seqJ++) {
00569                                                 reverse (byte_bufs[seqJ]);
00570                                         }
00571                                 }
00572                                 // append the columns to cols
00573                                 for (seqJ = 0; seqJ < seq_count; seqJ++) {
00574                                         byte [] tmp_buf = new byte [cols[seqJ].length + byte_bufs[seqJ].length];
00575                                         System.arraycopy (cols[seqJ], 0, tmp_buf, 0, cols[seqJ].length);
00576                                         System.arraycopy (byte_bufs[seqJ], 0, tmp_buf, cols[seqJ].length, byte_bufs[seqJ].length);
00577                                         cols[seqJ] = tmp_buf;
00578                                 }
00579                         }
00580                 }
00581                 return cols;
00582         }
00583 
00588         void reverse (byte [] byte_buf) {
00589                 for (int byteI = 0; byteI < byte_buf.length / 2; byteI++) {
00590                         byte tmp = (byte) SnpExporter.revtab[byte_buf[byteI]];
00591                         byte_buf[byteI] = (byte) SnpExporter.revtab[ byte_buf[byte_buf.length - byteI - 1] ];
00592                         byte_buf[byte_buf.length - byteI - 1] = tmp;                    
00593                 }
00594                 if(byte_buf.length%2==1){
00595                         byte_buf[byte_buf.length/2] = (byte) SnpExporter.revtab[ byte_buf[byte_buf.length/2] ];
00596                 }
00597         }
00598 
00603         int filterGapsInOneSequence (int seqI, Object [] byte_bufs) {
00604                 // filter gaps from a particular sequence while maintaining column
00605                 // integrity
00606                 int col_offset = 0;
00607                 for (int colI = 0; colI < ((byte []) byte_bufs[seqI]).length; colI++) {
00608                         if (((byte []) byte_bufs[seqI])[colI] != '-') {
00609                                 // copy the column to the current col_offset
00610                                 for (int seqJ = 0; seqJ < seq_count; seqJ++) {
00611                                         ((byte []) byte_bufs[seqJ])[col_offset] = ((byte []) byte_bufs[seqJ])[colI];
00612                                 }
00613                                 col_offset++;
00614                         }
00615                 }
00616                 return col_offset;
00617         }
00618 
00632         public byte [] readRawSequence (int ivI, int seqI, long left_col, long length) {
00633                 // check boundary condition
00634                 if (gis_tree[ivI][seqI].length () == 0) {
00635                         if (length == 0)
00636                                 return new byte [0];
00637                         else
00638                                 throw new ArrayIndexOutOfBoundsException ();
00639                 }
00640 
00641                 int l_iter = gis_tree[ivI][seqI].find (left_col);
00642                 int r_iter = gis_tree[ivI][seqI].find (left_col + length - 1);
00643                 FileKey l_fk, r_fk;
00644                 long l_gaps = 0;
00645 
00646                 // if the requested region contains nothing but gap then just return a
00647                 // buffer of gaps
00648                 long seq_off = gis_tree[ivI][seqI].getSequenceStart (l_iter);
00649                 if (gis_tree[ivI][seqI].sequenceLength () <= left_col
00650                                 && gis_tree[ivI][seqI].getSeqLength (l_iter) == 0
00651                                 && seq_off <= left_col) {
00652                         if (left_col + length - 1 > gis_tree[ivI][seqI].length ())
00653                                 throw new ArrayIndexOutOfBoundsException ();
00654                         byte [] bb = new byte [(int) length];
00655                         for (int bbI = 0; bbI < bb.length; bbI++)
00656                                 bb[bbI] = (byte) '-';
00657                         return bb;
00658                 }
00659 
00660                 // check for the case where the requested region lies within
00661                 // the same gap in the middle of the sequence
00662                 if (gis_tree[ivI][seqI].getKey (l_iter) instanceof GapKey
00663                                 && gis_tree[ivI][seqI].getKey (r_iter) instanceof GapKey
00664                                 && gis_tree[ivI][seqI].getKey (l_iter) == gis_tree[ivI][seqI]
00665                                                 .getKey (r_iter)) {
00666                         byte [] bb = new byte [(int) length];
00667                         for (int bbI = 0; bbI < bb.length; bbI++)
00668                                 bb[bbI] = (byte) '-';
00669                         return bb;
00670                 }
00671 
00672                 if (gis_tree[ivI][seqI].getKey (l_iter) instanceof FileKey) {
00673                         l_fk = (FileKey) gis_tree[ivI][seqI].getKey (l_iter);
00674                 } else {
00675                         // if we started in a gap the next one should be a FileKey
00676                         seq_off = gis_tree[ivI][seqI].getSequenceStart (l_iter);
00677                         l_iter = gis_tree[ivI][seqI].find_seqindex (seq_off);
00678                         l_fk = (FileKey) gis_tree[ivI][seqI].getKey (l_iter);
00679                 }
00680 
00681                 if (gis_tree[ivI][seqI].getKey (r_iter) instanceof FileKey) {
00682                         r_fk = (FileKey) gis_tree[ivI][seqI].getKey (r_iter);
00683                 } else {
00684                         // if we started in a gap the next one should be a FileKey
00685                         seq_off = gis_tree[ivI][seqI].getSequenceStart (r_iter) - 1;
00686                         r_iter = gis_tree[ivI][seqI].find_seqindex (seq_off);
00687                         r_fk = (FileKey) gis_tree[ivI][seqI].getKey (r_iter);
00688                 }
00689 
00690                 // should have l_fk and r_fk now.
00691                 // get the file offsets to read from these
00692                 long l_off = left_col - gis_tree[ivI][seqI].getStart (l_iter);
00693                 // calculate the number of newlines in the space between the first
00694                 // desired
00695                 // column and the first character in this key...
00696                 long key_col_pos = gis_tree[ivI][seqI].getStart (l_iter) % line_width;
00697                 long l_newlines = ((key_col_pos + l_off) / 80) * newline_size;
00698                 if (l_off < 0) {
00699                         // this happens when the desired column is a gap. just skip ahead
00700                         l_gaps = -l_off; // track how many gaps we'll need later
00701                         l_newlines = 0;
00702                         l_off = 0;
00703                 }
00704                 l_off += l_fk.getOffset () + l_newlines;
00705 
00706                 long r_off = left_col + length - 1
00707                                 - gis_tree[ivI][seqI].getStart (r_iter);
00708                 long r_key_col = gis_tree[ivI][seqI].getStart (r_iter) % line_width;
00709                 long r_newlines = ((r_key_col + r_off) / line_width) * newline_size;
00710                 r_off += r_fk.getOffset () + r_newlines;
00711 
00712                 // now that the exact file offsets of the desired sequence have been
00713                 // calculated, read it from the XMFA.
00714                 if (r_off - l_off + 1 + l_gaps < 0) {
00715                         throw new RuntimeException ("Unexpected Error.");
00716                 }
00717                 byte [] byte_buf = new byte [(int) (r_off - l_off + 1 + l_gaps)];
00718                 for (int l_gapI = 0; l_gapI < l_gaps; l_gapI++) {
00719                         byte_buf[l_gapI] = (byte) '-';
00720                 }
00721 
00722                 try {
00723                         xmfa_file.seek (l_off);
00724                         xmfa_file.read (byte_buf, (int) l_gaps, byte_buf.length
00725                                         - (int) l_gaps);
00726                 } catch (IOException e) {
00727                         throw new RuntimeException ("Unexpected file reading error.", e);
00728                 }
00729                 return byte_buf;
00730         }
00731 
00732         static public byte [] filterNewlines (byte [] byte_buf) {
00733                 // filter the newlines
00734                 int byte_off = 0;
00735                 for (int byteI = 0; byteI < byte_buf.length; byteI++) {
00736                         if (byte_buf[byteI] == '\r' || byte_buf[byteI] == '\n')
00737                                 continue;
00738                         byte_buf[byte_off] = byte_buf[byteI];
00739                         byte_off++;
00740                 }
00741                 byte [] bb2 = new byte [byte_off];
00742                 System.arraycopy (byte_buf, 0, bb2, 0, byte_off);
00743                 return bb2;
00744         }
00745 
00750         int getLCB (Genome g, long position) {
00751                 int ivI = 0;
00752                 for (; ivI < intervals.length; ivI++) {
00753                         if (intervals[ivI].getStart (g) <= position
00754                                         && position <= intervals[ivI].getLength (g))
00755                                 break; // found the starting interval -- remember lengths array
00756                         // contains r_end
00757                 }
00758                 // throw an exception if the requested range couldn't be found
00759                 if (ivI == intervals.length)
00760                         throw new ArrayIndexOutOfBoundsException ("genome " + g
00761                                         + " position " + position);
00762                 return ivI;
00763         }
00764 
00765         // FIXME: get rev. comp right in these two functions!
00766         long revCompify (long position, Genome g, int ivI) {
00767                 try {
00768                         return intervals[ivI].getReverse (g) ? intervals[ivI].getLength (g)
00769                                         - intervals[ivI].getStart (g) - position : position;
00770                 } catch (Exception e) {
00771                         e.printStackTrace ();
00772                         return -1;
00773                 }
00774         }
00775         
00780         long globalToLCB (long position, Genome g, int ivI) {
00781                 long offset = position - intervals[ivI].getStart (g);
00782                 return revCompify (offset, g, ivI);
00783         }
00784 
00785         // 
00786         
00791         long LCBToGlobal (long position, Genome g, int ivI) {
00792                 long offset = revCompify (position, g, ivI);
00793                 offset += intervals[ivI].getStart (g);
00794                 if (intervals[ivI].getStart (g) == 0)
00795                         return 0;
00796                 return offset;
00797         }
00798 
00806         public long [] getLCBAndColumn (Genome g, long position) {
00807                 // determine the LCB
00808                 int ivI = getLCB (g, position);
00809 
00810                 long lcb_offset = globalToLCB (position, g, ivI);
00811                 long fk_left_col = gis_tree[ivI][g.getSourceIndex ()]
00812                                 .seqPosToColumn (lcb_offset);
00813 
00814                 long [] lcb_and_col = new long [2];
00815                 lcb_and_col[0] = ivI;
00816                 lcb_and_col[1] = fk_left_col;
00817                 return lcb_and_col;
00818         }
00819 
00829         public void getColumnCoordinates (XmfaViewerModel model, int lcb,
00830                         long column, long [] seq_offsets, boolean [] gap) {
00831                 for (int seqI = 0; seqI < seq_count; seqI++) {
00832                         Genome g = model.getGenomeBySourceIndex (seqI);
00833 
00834                         seq_offsets[seqI] = gis_tree[lcb][g.getSourceIndex ()]
00835                                         .columnToSeqPos (column);
00836                         gap[seqI] = column != gis_tree[lcb][g.getSourceIndex ()]
00837                                         .seqPosToColumn (seq_offsets[seqI]);
00838                         seq_offsets[seqI] = LCBToGlobal (seq_offsets[seqI], g, lcb);
00839                         if(seq_offsets[seqI]>g.getLength())
00840                                 gap[seqI]=true;
00841                 }
00842         }
00848         public long getCoordinate (XmfaViewerModel model, Genome g, int lcb,
00849                         long column, Boolean gap) {
00850 
00851                 long seq_offset = gis_tree[lcb][g.getSourceIndex ()]
00852                                 .columnToSeqPos (column);
00853                 gap = column != gis_tree[lcb][g.getSourceIndex ()]
00854                                 .seqPosToColumn (seq_offset);
00855                 seq_offset = LCBToGlobal (seq_offset, g, lcb);
00856                 return seq_offset;
00857         }
00858         
00864         public long getLcbLength(int lcbId)
00865         {
00866                 return gis_tree[lcbId][0].length();
00867         }
00868 
00872         public void setReference (Genome g) {
00873                 for (int lcbI = 0; lcbI < lcb_list.length; lcbI++) {
00874                         lcb_list[lcbI].setReference (g);
00875                 }
00876         }
00877 
00878         public void setSourceLcbList(LCB [] source_lcb_list) {
00879                 this.source_lcb_list = source_lcb_list;
00880         }
00881 
00882         public LCB [] getSourceLcbList() {
00883                 return source_lcb_list;
00884         }
00885 }

Generated on Mon Aug 19 06:03:44 2013 for Mauve by doxygen 1.3.6