libMems/SortedMerList.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * $Id: SortedMerList.cpp,v 1.23 2004/03/01 02:40:08 darling Exp $
00003  * This file is copyright 2002-2007 Aaron Darling and authors listed in the AUTHORS file.
00004  * Please see the file called COPYING for licensing, copying, and modification
00005  * Please see the file called COPYING for licensing details.
00006  * **************
00007  ******************************************************************************/
00008 
00009 #ifdef HAVE_CONFIG_H
00010 #include "config.h"
00011 #endif
00012 
00013 #include "libMems/SortedMerList.h"
00014 
00015 using namespace std;
00016 using namespace genome;
00017 namespace mems {
00018 
00019 const uint8* SortedMerList::BasicDNATable(){
00020         static const uint8* const bdt = SortedMerList::CreateBasicDNATable();
00021         return bdt;
00022 }
00023 
00024 const uint8* SortedMerList::ProteinTable(){
00025         static const uint8* const bdt = SortedMerList::CreateProteinTable();
00026         return bdt;
00027 }
00028 
00029 const uint8* SortedMerList::CreateBasicDNATable(){
00030         uint8* bdt = new uint8[UINT8_MAX];
00031         memset(bdt, 0, UINT8_MAX);
00032         bdt['c'] = 1;
00033         bdt['C'] = 1;
00034         bdt['b'] = 1;
00035         bdt['B'] = 1;
00036         bdt['y'] = 1;
00037         bdt['Y'] = 1;
00038         bdt['g'] = 2;
00039         bdt['G'] = 2;
00040         bdt['s'] = 2;
00041         bdt['S'] = 2;
00042         bdt['k'] = 2;
00043         bdt['K'] = 2;
00044         bdt['t'] = 3;
00045         bdt['T'] = 3;
00046         return bdt;
00047 }
00048 
00049 const uint8* SortedMerList::CreateProteinTable(){
00050         uint8* pt = new uint8[UINT8_MAX];
00051         memset(pt, 0, UINT8_MAX);
00052         pt['A'] = 0;
00053         pt['R'] = 1;
00054         pt['N'] = 2;
00055         pt['D'] = 3;
00056         pt['C'] = 4;
00057         pt['Q'] = 5;
00058         pt['E'] = 6;
00059         pt['G'] = 7;
00060         pt['H'] = 8;
00061         pt['I'] = 9;
00062         pt['L'] = 10;
00063         pt['K'] = 11;
00064         pt['M'] = 12;
00065         pt['F'] = 13;
00066         pt['P'] = 14;
00067         pt['S'] = 15;
00068         pt['T'] = 16;
00069         pt['W'] = 17;
00070         pt['Y'] = 18;
00071         pt['V'] = 19;
00072         
00073         pt['a'] = 0;
00074         pt['r'] = 1;
00075         pt['n'] = 2;
00076         pt['d'] = 3;
00077         pt['c'] = 4;
00078         pt['q'] = 5;
00079         pt['e'] = 6;
00080         pt['g'] = 7;
00081         pt['h'] = 8;
00082         pt['i'] = 9;
00083         pt['l'] = 10;
00084         pt['k'] = 11;
00085         pt['m'] = 12;
00086         pt['f'] = 13;
00087         pt['p'] = 14;
00088         pt['s'] = 15;
00089         pt['t'] = 16;
00090         pt['w'] = 17;
00091         pt['y'] = 18;
00092         pt['v'] = 19;
00093         return pt;
00094 }
00095 
00096 SortedMerList::SortedMerList(){
00097         //default to BasicDNA settings
00098         header.length = 0;
00099         header.alphabet_bits = 2;
00100         header.unique_mers = NO_UNIQUE_COUNT;
00101         memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
00102         header.description[0] = 0;
00103         header.seed_length = DNA_MER_SIZE;
00104         header.id = 0;
00105         header.circular = false;
00106         mask_size = DNA_MER_SIZE;
00107         mer_mask = 0;
00108         seed_mask = 0;
00109         // init sequence data to null
00110         sequence = NULL;
00111         binary_seq_len = 0;
00112 }
00113 
00114 SortedMerList::SortedMerList( const SortedMerList& sa ){
00115         sequence = NULL;
00116         *this = sa;
00117 }
00118 
00119 SortedMerList& SortedMerList::operator=(const SortedMerList& sa)
00120 {
00121         header = sa.header;
00122         mer_mask = sa.mer_mask;
00123         seed_mask = sa.seed_mask;
00124         mask_size = sa.mask_size;
00125         binary_seq_len = sa.binary_seq_len;
00126 
00127         // copy binary sequence data
00128         if( sa.sequence != NULL ){
00129                 if( sequence != NULL )
00130                         delete[] sequence;
00131                 sequence = new uint32[binary_seq_len];
00132                 memcpy(sequence, sa.sequence, sizeof(uint32) * binary_seq_len);
00133         }else
00134                 sequence = NULL;
00135 
00136         return *this;
00137 }
00138 
00139 SortedMerList::~SortedMerList(){
00140         if( sequence != NULL )
00141                 delete[] sequence;
00142 }
00143 
00144 void SortedMerList::Clear(){
00145         //default to BasicDNA settings
00146         header.length = 0;
00147         header.alphabet_bits = 2;
00148         header.unique_mers = NO_UNIQUE_COUNT;
00149         memcpy(header.translation_table, BasicDNATable(), UINT8_MAX);
00150         header.description[0] = 0;
00151         header.seed_length = DNA_MER_SIZE;
00152         header.id = 0;
00153         header.circular = false;
00154         mask_size = DNA_MER_SIZE;
00155         mer_mask = 0;
00156         seed_mask = 0;
00157         // delete sequence data
00158         if( sequence != NULL ){
00159                 delete[] sequence;
00160                 sequence = NULL;
00161         }
00162         binary_seq_len = 0;
00163 }
00164 
00165 uint32 SortedMerList::CalculateMaxMerSize() const{
00166         bmer tmp;
00167         return (sizeof(tmp.mer) * 8) / header.alphabet_bits;
00168 }
00169 
00170 boolean SortedMerList::FindMer(const uint64 query_mer, gnSeqI& result){
00171         bmer merle;
00172         merle.mer = query_mer;
00173         gnSeqI last_pos = Length();
00174         if( last_pos == 0 || (last_pos < header.seed_length && !header.circular) )
00175                 return false;
00176         last_pos -= header.circular ? 1 : header.seed_length;
00177         result = bsearch(merle, 0, last_pos );
00178         return ((*this)[result].mer == merle.mer);
00179 }
00180 
00181 boolean SortedMerList::Find(const string& query_seq, gnSeqI& result) {
00182         struct bmer merle;
00183         merle.mer = 0;
00184 
00185         //check the length to make sure it is small enough
00186         gnSeqI len = query_seq.length() * header.alphabet_bits < 64 ? 
00187                 query_seq.length() : 64 / header.alphabet_bits;
00188                 
00189         translate((uint8*)&merle.mer, query_seq.c_str(), len);
00190         return FindMer( merle.mer, result );
00191 }
00192 
00193 void SortedMerList::FindAll(const string& query_seq, vector<gnSeqI> result) {
00194         struct bmer merle;
00195         merle.mer = 0;
00196 
00197         //check the length to make sure it is small enough
00198         gnSeqI len = query_seq.length() * header.alphabet_bits < 64 ? 
00199                 query_seq.length() : 64 / header.alphabet_bits;
00200                 
00201         translate((uint8*)&merle.mer, query_seq.c_str(), len);
00202         
00203         //find the first match then start filling forward.
00204         gnSeqI matchI = 0;
00205         gnSeqI last_pos = Length();
00206         last_pos -= header.circular ? 1 : header.seed_length;
00207         bmer matchmer;
00208         matchI = bsearch(merle, 0, last_pos);
00209 
00210         //first seek backwards
00211         int64 cur_matchI = matchI;
00212         matchmer = (*this)[matchI];
00213         while(cur_matchI >= 0 && matchmer.mer == merle.mer){
00214                 cur_matchI--;
00215                 matchmer = (*this)[cur_matchI];
00216         }
00217         int64 first_matchI = cur_matchI+1;
00218 
00219         //now seek forwards
00220         cur_matchI = matchI+1;
00221         matchmer = (*this)[cur_matchI];
00222         while(cur_matchI < GNSEQI_END && matchmer.mer == merle.mer){
00223                 cur_matchI++;
00224                 matchmer = (*this)[cur_matchI];
00225         }
00226         //fill the result array
00227         for(matchI = first_matchI; matchI < cur_matchI; matchI++)
00228                 result.push_back(matchI);
00229 }
00230 
00231 string SortedMerList::Description() const{
00232         return header.description;
00233 }
00234 
00235 void SortedMerList::SetDescription(const string& d){
00236         strncpy(header.description, d.c_str(), DESCRIPTION_SIZE-1);
00237 }
00238 
00239 uint SortedMerList::SeedLength() const{
00240         return header.seed_length;
00241 }
00245 uint SortedMerList::SeedWeight() const{
00246         return header.seed_weight;
00247 }
00251 uint64 SortedMerList::Seed() const{
00252         return header.seed;
00253 }
00254 
00255 boolean SortedMerList::IsCircular() const{
00256         return header.circular;
00257 }
00258 
00259 uint64 SortedMerList::GetMerMask() const{
00260         return mer_mask;
00261 }
00262 
00263 uint64 SortedMerList::GetSeedMask() const{
00264         return seed_mask;
00265 }
00266 
00267 uint32 SortedMerList::GetMerMaskSize() const{
00268         return mask_size;
00269 }
00270 
00271 void SortedMerList::SetMerMaskSize(uint32 mer_size){
00272         if(mer_size > header.seed_length)
00273                 mask_size = header.seed_length;
00274         else
00275                 mask_size = mer_size;
00276 
00277         // calculate the mer mask
00278         mer_mask = UINT32_MAX;
00279         mer_mask <<= 32;
00280         mer_mask |= UINT32_MAX;
00281         mer_mask <<= (64 - header.alphabet_bits * mer_size);
00282 }
00283 
00284 gnSeqI SortedMerList::Length() const{
00285         return header.length;
00286 }
00287 
00288 gnSeqI SortedMerList::SMLLength() const{
00289         // make sure there was at least one seed
00290         if( header.length < header.seed_length )
00291                 return 0;
00292         if( !header.circular )
00293                 return header.length - header.seed_length + 1;
00294         return header.length;
00295 }
00296 
00297 sarID_t SortedMerList::GetID() const{
00298         return header.id;
00299 }
00300 void SortedMerList::SetID(const sarID_t d){
00301         header.id = d;
00302 }
00303 
00304 #define OPT_HEADER_ALPHABET_BITS DNA_ALPHA_BITS
00305 
00306 void SortedMerList::SetSequence(gnSeqC* seq_buf, gnSeqI seq_len){
00307         binary_seq_len = (seq_len * header.alphabet_bits) / 32;
00308         if((seq_len * header.alphabet_bits) % 32 != 0)
00309                 binary_seq_len++;
00310 
00311         binary_seq_len+=2;      // zero-pad the end for extra working room
00312 
00313         if( sequence != NULL )
00314                 delete[] sequence;
00315         sequence = new uint32[binary_seq_len];
00316         translate32(sequence, seq_buf, seq_len);
00317 }
00318 
00319 // this should return a mer containing all characters covered by the
00320 // spaced seed
00321 uint64 SortedMerList::GetMer(gnSeqI position) const
00322 {
00323         //check this for access violations.
00324         uint64 mer_a;
00325         gnSeqI mer_word, mer_bit;
00326         uint32 merle;
00327         //get mer_a
00328         mer_a = 0;
00329         mer_word = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) / (gnSeqI)32;
00330         mer_bit = (position * (gnSeqI)OPT_HEADER_ALPHABET_BITS) % (gnSeqI)32;
00331         mer_a |= sequence[mer_word++];
00332         mer_a <<= 32;
00333         mer_a |= sequence[mer_word++];
00334         if(mer_bit > 0){
00335                 merle = sequence[mer_word];
00336                 merle >>= 32 - mer_bit;
00337                 mer_a <<= mer_bit;
00338                 mer_a |= merle;
00339         }
00340         mer_a &= mer_mask;
00341         return mer_a;
00342 }
00343 
00344 //potential buffer overflows here.  make dest extra big.
00345 void SortedMerList::GetBSequence(uint32* dest, const gnSeqI len, const gnSeqI offset){
00346         //first determine the byte offset of the sequence within the file.
00347         if(offset >= header.length){
00348                 Throw_gnEx( IndexOutOfBounds() );
00349         }
00350         uint64 startpos = (offset * OPT_HEADER_ALPHABET_BITS) / 32;
00351         int begin_remainder = (offset * OPT_HEADER_ALPHABET_BITS) % 32;
00352         uint64 readlen = offset + len < header.length ? len : header.length - offset;
00353 
00354         gnSeqI word_read_len = (readlen * OPT_HEADER_ALPHABET_BITS) / 32;
00355         int end_remainder = (readlen * OPT_HEADER_ALPHABET_BITS) % 32;
00356         if(begin_remainder + (readlen * OPT_HEADER_ALPHABET_BITS) > 32
00357            && end_remainder > 0)
00358                 word_read_len++;
00359         if(begin_remainder > 0)
00360                 word_read_len++;
00361         
00362         //now do the actual read
00363         memcpy((char*)dest, (char*)sequence + (startpos * 4), word_read_len * 4);
00364         
00365         //now shift if needed
00366         ShiftWords(dest, word_read_len, -begin_remainder);
00367         
00368         //now mask if needed
00369         if(end_remainder > begin_remainder){
00370                 uint32 mask = 0xFFFFFFFF;
00371                 mask <<= 32 - (end_remainder - begin_remainder);
00372                 dest[word_read_len-1] &= mask;
00373         }else if(end_remainder < begin_remainder){
00374                 uint32 mask = 0xFFFFFFFF;
00375                 mask <<= (begin_remainder - end_remainder);
00376                 dest[word_read_len-2] &= mask;
00377         }
00378 }
00379 
00380 gnSeqI SortedMerList::bsearch(const struct bmer& query_mer, const gnSeqI start, const gnSeqI end) {
00381 
00382         gnSeqI middle = (start + end) / 2;
00383         struct bmer midmer = (*this)[middle];
00384         if(midmer.mer == query_mer.mer)
00385                 return middle;
00386         else if((midmer.mer < query_mer.mer) && (middle < end))
00387                 return bsearch(query_mer, middle + 1, end);
00388         else if((midmer.mer > query_mer.mer) && (start < middle))
00389                 return bsearch(query_mer, start, middle - 1);
00390         
00391         //if we get here then the mer was not found.
00392         //return where it would be if it existed.
00393         return middle;
00394 }
00395 
00396 //translate the character sequence to binary form based on the
00397 //translation table.
00398 void SortedMerList::translate(uint8* dest, const gnSeqC* src, const gnSeqI len) const{
00399         uint8 start_bit = 0;
00400         gnSeqI cur_byte = 0;
00401         const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00402         dest[cur_byte] = 0;
00403         for(uint32 i=0; i < len; i++){
00404                 uint8 tmp = header.translation_table[src[i]];
00405                 if(start_bit + alpha_bits <= 8){
00406                         tmp <<= 8 - start_bit - alpha_bits;
00407                         dest[cur_byte] |= tmp;
00408                 }else{
00409                         uint8 over_bits = (start_bit + alpha_bits) % 8;
00410                         uint8 tmp2 = tmp;
00411                         tmp2 <<= 8 - over_bits;
00412                         tmp >>= over_bits;
00413                         dest[cur_byte] |= tmp;
00414                         dest[cur_byte+1] |= tmp2;
00415                 }
00416                 start_bit += alpha_bits;
00417                 if(start_bit >= 8){
00418                         start_bit %= 8;
00419                         cur_byte++;
00420                         dest[cur_byte] = 0;
00421                 }
00422         }
00423 }
00424 
00425 void SortedMerList::translate32(uint32* dest, const gnSeqC* src, const gnSeqI len) const{
00426         if( len == 0 )
00427                 return;
00428         uint8 start_bit = 0;
00429         gnSeqI cur_word = 0;
00430         const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00431         dest[cur_word] = 0;
00432         for(uint32 i=0; i < len; i++){
00433                 uint32 tmp = header.translation_table[src[i]];
00434                 if(start_bit + alpha_bits <= 32){
00435                         tmp <<= 32 - start_bit - alpha_bits;
00436                         dest[cur_word] |= tmp;
00437                         start_bit += alpha_bits;
00438                         if(start_bit >= 32 && i < len - 1){
00439                                 start_bit %= 32;
00440                                 cur_word++;
00441                                 dest[cur_word] = 0;
00442                         }
00443                 }else{
00444                         uint8 over_bits = (start_bit + alpha_bits) % 32;
00445                         uint32 tmp2 = tmp;
00446                         tmp2 <<= 32 - over_bits;
00447                         tmp >>= over_bits;
00448                         dest[cur_word] |= tmp;
00449                         cur_word++;
00450                         dest[cur_word] = 0;
00451                         dest[cur_word] |= tmp2;
00452                         start_bit = over_bits;
00453                 }
00454         }
00455 }
00456 SMLHeader SortedMerList::GetHeader() const{
00457         return header;
00458 }
00459 
00460 gnSeqI SortedMerList::UniqueMerCount(){
00461         if(header.unique_mers != NO_UNIQUE_COUNT)
00462                 return header.unique_mers;
00463 
00464         uint32 MER_BUFFER_SIZE = 16384;  //not quite arbitrary (2^14)
00465         gnSeqI cur_pos = 0;
00466         vector<bmer> mer_vector;
00467         bmer prev_mer;
00468         gnSeqI m_unique = 0;
00469         gnSeqI report_interval = MER_BUFFER_SIZE * 212;
00470         while(cur_pos < header.length){
00471                 if(!Read(mer_vector, MER_BUFFER_SIZE, cur_pos)){
00472                         break;
00473 //                      DebugMsg("SortedMerList::UniqueMerCount: Error reading bmer vector.");
00474 //                      return NO_UNIQUE_COUNT;
00475                 }
00476                 uint32 mer_count = mer_vector.size();
00477                 if(mer_count == 0)
00478                         break;
00479                 if(cur_pos > 0 && prev_mer.mer != mer_vector[0].mer)
00480                         m_unique++;
00481                 
00482                 //count them up.
00483                 uint32 i = 0;
00484                 for(uint32 j = 1; j < mer_count; j++){
00485                         if((mer_vector[i].mer & mer_mask) != (mer_vector[j].mer & mer_mask) )
00486                                 m_unique++;
00487                         i++;
00488                 }
00489                 prev_mer = mer_vector[i];
00490                 cur_pos += mer_count;
00491                 if( cur_pos % report_interval == 0 ){
00492 //                      cout << "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
00493                         cout << m_unique << "/" << cur_pos << endl;
00494                 }
00495         }
00496         cout << endl;
00497         m_unique++;
00498         header.unique_mers = m_unique;
00499         return header.unique_mers;
00500 }
00501 
00502 //will not handle more than 8GB sequence on 32-bit systems
00503 void SortedMerList::ShiftWords(unsigned int* data, uint32 length, int32 bits)
00504 {
00505         int32 word_bits = 8 * sizeof(unsigned int);
00506         if(bits > 0 && bits < word_bits){
00507                 //shift everything right starting at the end
00508                 data[length - 1] >>= bits;
00509                 for(int i=length-2; i >= 0; i--){
00510                         uint32 tmp = data[i];
00511                         tmp <<= word_bits - bits;
00512                         data[i+1] |= tmp;
00513                         data[i] >>= bits;
00514                 }
00515         }else if(bits < 0 && bits > (-1)*word_bits){
00516                 bits *= -1;
00517                 //shift everything left
00518                 data[0] <<= bits;
00519                 for(uint32 i=0; i < length; i++){
00520                         uint32 tmp = data[i+1];
00521                         tmp >>= word_bits - bits;
00522                         data[i] |= tmp;
00523                         data[i+1] <<= bits;
00524                 }
00525         }
00526 }
00527 
00528 void SortedMerList::FillSML(gnSeqC* seq_buf, gnSeqI seq_len, boolean circular, vector<bmer>& sml_array){
00529         const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00530         const uint32 mer_size = header.seed_length;
00531         gnSeqI sar_len = seq_len;
00532         if(!circular)
00533                 sar_len -= header.seed_length - 1;
00534         sml_array.reserve(sar_len);
00535 
00536         bmer cur_suffix;
00537         cur_suffix.mer = 0;
00538         cur_suffix.position = 0;
00539 
00540         /* now fill in the suffix array with the forward sequence*/
00541         for(gnSeqI i=0; i < mer_size; i++){
00542                 cur_suffix.mer <<= alpha_bits;
00543                 cur_suffix.mer |= header.translation_table[seq_buf[i]];
00544         }
00545         uint8 dead_bits = 64 - (mer_size * alpha_bits);
00546         cur_suffix.mer <<= dead_bits;
00547 
00548         sml_array.push_back(cur_suffix);
00549 
00550         //fill sml_array with mers
00551         for(gnSeqI seqI = 1; seqI < sar_len; seqI++){//already added the
00552                                                                                                         //first one
00553                 cur_suffix.position++;
00554                 cur_suffix.mer <<= alpha_bits;
00555                 uint64 new_mer = header.translation_table[seq_buf[seqI+(mer_size-1)]];
00556                 new_mer <<= dead_bits;
00557                 cur_suffix.mer |= new_mer;
00558                 sml_array.push_back(cur_suffix);
00559         }
00560 }
00561 
00562 void SortedMerList::FillSML(const gnSequence& seq, vector<bmer>& sml_array){
00563         gnSeqI seq_len = seq.length();
00564         Array<gnSeqC> seq_buf( seq_len );
00565         seq.ToArray(seq_buf.data, seq_len);
00566         FillSML(seq_buf.data, seq_len, seq.isCircular(), sml_array);
00567 }
00568 
00569 void SortedMerList::FillSML(gnSeqI seq_len, vector<gnSeqI>& pos_array){
00570         pos_array.clear();
00571         pos_array.reserve( seq_len );
00572         for(gnSeqI seqI = 0; seqI < seq_len; seqI++ )
00573                 pos_array.push_back(seqI);
00574 }
00575 
00576 uint64 SortedMerList::GetDnaMer(gnSeqI offset) const
00577 {
00578         // get the forward orientation mer
00579         uint64 mer_a = SortedMerList::GetMer( offset );
00580         //find the reverse complement of mer_a and return it if it's
00581         //smaller
00582         uint64 mer_c = RevCompMer( mer_a, header.seed_length ); //mer_c will be the reverse complement
00583         
00584         // for debugging
00585 //      if( mer_c < mer_a )
00586 //              return mer_c;
00587         return mer_a < mer_c ? mer_a : mer_c;
00588 }
00589 
00590 #define OPT_ALPHA_MASQ 0x00000003
00591 
00592 uint64 SortedMerList::RevCompMer( uint64 mer_a, int mer_length ) const
00593 {
00594         //find the reverse complement of mer_a and return it if it's
00595         //smaller
00596         uint64 mer_b, mer_c = 0;        //mer_c will be the reverse complement
00597         mer_b = ~mer_a;
00598 //      uint32 masq = 0xffffffff;
00599 //      masq >>= 32 - header.alphabet_bits;
00600         for(uint32 i = 0; i < 64; i += OPT_HEADER_ALPHABET_BITS){
00601                 mer_c |= mer_b & OPT_ALPHA_MASQ;
00602 //              mer_c |= mer_b & masq;
00603                 mer_b >>= OPT_HEADER_ALPHABET_BITS;
00604                 mer_c <<= OPT_HEADER_ALPHABET_BITS;
00605         }
00606         mer_c <<= 64 - (OPT_HEADER_ALPHABET_BITS * (mer_length+1));
00607         mer_c |= 1;
00608         return mer_c;
00609 }
00610 
00611 
00612 void SortedMerList::FillDnaSML(const gnSequence& seq, vector<bmer>& sml_array){
00613         /* now fill in the suffix array with the forward sequence*/
00614         const uint32 alpha_bits = OPT_HEADER_ALPHABET_BITS;
00615         const uint32 mer_size = header.seed_length;
00616         gnSeqI sar_len = seq.length();
00617         if( sar_len < header.seed_length )
00618                 return; // can't have an sml if there ain't enough sequence
00619         if( !seq.isCircular() )
00620                 sar_len -= ( header.seed_length - 1);
00621         sml_array.reserve(sar_len);
00622 
00623         uint32 dead_bits = 64 - (mer_size * alpha_bits);
00624         uint64 create_mask = UINT32_MAX;
00625         create_mask <<= 32;
00626         create_mask |= UINT32_MAX;
00627         create_mask <<= dead_bits;
00628 
00629         bmer cur_suffix, rcur_suffix;
00630         cur_suffix.mer = sequence[0];
00631         cur_suffix.mer <<= 32;
00632         cur_suffix.mer |= sequence[1];
00633         cur_suffix.mer &= create_mask;
00634         cur_suffix.position = 0;
00635         rcur_suffix.mer = 0;
00636         rcur_suffix.position = 0;
00637         
00638         //find the reverse complement of cur_suffix.mer and return it if it's
00639         //smaller
00640         uint64 mer_b = 0;
00641         mer_b = ~cur_suffix.mer;
00642 //      uint32 masq = 0xffffffff;
00643 //      masq >>= 32 - alpha_bits;
00644         for(uint32 i = 0; i < 64; i += alpha_bits){
00645 //              rcur_suffix.mer |= mer_b & masq;
00646                 rcur_suffix.mer |= mer_b & OPT_ALPHA_MASQ;
00647                 mer_b >>= alpha_bits;
00648                 rcur_suffix.mer <<= alpha_bits;
00649         }
00650         rcur_suffix.mer <<= dead_bits - alpha_bits;
00651         rcur_suffix.mer |= 1;
00652 
00653         //add the first mer
00654         if(cur_suffix.mer < rcur_suffix.mer)
00655                 sml_array.push_back(cur_suffix);
00656         else
00657                 sml_array.push_back(rcur_suffix);
00658 
00659         //fill sml_array with mers
00660         gnSeqI  endI = sar_len + mer_size;
00661         if(seq.isCircular())
00662                 endI += mer_size;
00663 
00664         uint32 rdead_bits = 64 - alpha_bits - dead_bits;
00665         uint64 tmp_rseq = 0;
00666         uint32 seqI = (mer_size * alpha_bits) / 32;
00667         int32 cur_bit = 32 - alpha_bits - ((mer_size * alpha_bits) % 32);
00668         uint32 cur_seq = sequence[seqI];
00669         uint64 tmp_seq;
00670 //      uint32 alpha_mask = 0xFFFFFFFF;
00671 //      alpha_mask >>= 32 - alpha_bits;
00672         uint64 revalpha_mask = OPT_ALPHA_MASQ;
00673         revalpha_mask <<= dead_bits;
00674 
00675         //which is slower? a memory operation or a conditional?
00676         //probably a memory operation.
00677         for(gnSeqI cur_pos = mer_size + 1; cur_pos < endI; cur_pos++){//already added the
00678                                                                                                         //first one
00679                 //increment positions
00680                 cur_suffix.position++;
00681                 rcur_suffix.position++;
00682                 //extract the next character
00683                 tmp_seq = cur_seq;
00684                 tmp_seq >>= cur_bit;
00685                 tmp_seq &= OPT_ALPHA_MASQ;
00686                 tmp_seq <<= dead_bits;
00687                 
00688                 //add it to the forward mer
00689                 cur_suffix.mer <<= alpha_bits;
00690                 cur_suffix.mer |= tmp_seq;
00691 
00692                 //do the reverse complement mer
00693                 tmp_seq = ~tmp_seq;
00694                 tmp_seq &= revalpha_mask;
00695                 tmp_rseq = tmp_seq;
00696                 tmp_rseq <<= rdead_bits;
00697                 rcur_suffix.mer >>= alpha_bits;
00698                 rcur_suffix.mer |= tmp_rseq;
00699                 rcur_suffix.mer &= create_mask;
00700                 rcur_suffix.mer |= 1;
00701                 if(cur_suffix.mer < rcur_suffix.mer)
00702                         sml_array.push_back(cur_suffix);
00703                 else
00704                         sml_array.push_back(rcur_suffix);
00705 
00706                 cur_bit -= alpha_bits;
00707                 if(cur_bit < 0){
00708                         cur_bit += alpha_bits;
00709                         cur_seq <<= 16;         //trade bitwise ops for conditional
00710                         cur_seq <<= 16 - (cur_bit);
00711                         seqI++;
00712                         tmp_seq = sequence[seqI];
00713                         tmp_seq >>= cur_bit;
00714                         cur_seq |= tmp_seq;
00715                         cur_bit += 32 - alpha_bits;
00716                 }
00717         }
00718 }
00719 
00720 
00721 uint64 SortedMerList::GetSeedMer( gnSeqI offset ) const
00722 {
00723         //check this for access violations.
00724         uint64 mer_a = SortedMerList::GetMer( offset );
00725         uint64 mer_b = SortedMerList::GetMer( offset + 1 );
00726         uint64 seed_mer = 0;
00727         uint64 alpha_mask = 1;
00728         alpha_mask <<= OPT_HEADER_ALPHABET_BITS;
00729         alpha_mask--;
00730         alpha_mask <<= 62;
00731         uint64 cur_alpha_mask = alpha_mask;
00732         uint64 char_mask = 1;
00733         char_mask <<= header.seed_length - 1;
00734         uint64 cur_mer = mer_a;
00735         const int mer_transition = 64 / OPT_HEADER_ALPHABET_BITS;
00736         int patternI = 0;
00737         int rshift_amt = 64 - OPT_HEADER_ALPHABET_BITS;
00738         for( ; patternI < header.seed_length; patternI++ ){
00739                 if( patternI == mer_transition ){
00740                         cur_mer = mer_b;
00741                         cur_alpha_mask = alpha_mask;
00742                         rshift_amt = 64 - OPT_HEADER_ALPHABET_BITS;
00743                 }
00744                 if( (header.seed & char_mask) != 0 ){
00745                         uint64 char_tmp = cur_mer & cur_alpha_mask;
00746                         char_tmp >>= rshift_amt;
00747                         seed_mer <<= OPT_HEADER_ALPHABET_BITS;
00748                         seed_mer |= char_tmp;
00749                 }
00750                 cur_alpha_mask >>= OPT_HEADER_ALPHABET_BITS;
00751                 char_mask >>= 1;
00752                 rshift_amt -= OPT_HEADER_ALPHABET_BITS;
00753         }
00754 
00755         seed_mer <<= 64 - (OPT_HEADER_ALPHABET_BITS * header.seed_weight);
00756         return seed_mer;
00757 }
00758 
00759 uint64 SortedMerList::GetDnaSeedMer( gnSeqI offset ) const
00760 {
00761         uint64 seed_mer = SortedMerList::GetSeedMer( offset );
00762         uint64 rev_mer = RevCompMer( seed_mer, header.seed_weight );
00763         return seed_mer < rev_mer ? seed_mer : rev_mer;
00764 }
00765 
00766 void SortedMerList::FillDnaSeedSML(const gnSequence& seq, vector<bmer>& sml_array){
00767         // first get the length of the sequence
00768         gnSeqI sar_len = SMLLength();
00769         if( sar_len < header.seed_length )
00770                 return; // can't have an sml if there ain't enough sequence
00771         sml_array.resize(sar_len);
00772         
00773         /* now fill in the sml_array with the forward sequence */
00774         for( gnSeqI seedI = 0; seedI < sar_len; seedI++ ){
00775                 sml_array[seedI].mer = GetDnaSeedMer( seedI );
00776                 sml_array[seedI].position = seedI;
00777         }
00778 }
00779 
00780 
00781 void SortedMerList::Create(const gnSequence& seq, const uint64 seed){
00782         
00783         if(CalculateMaxMerSize() == 0)
00784                 Throw_gnExMsg( SMLCreateError(), "Alphabet size is too large" );
00785 
00786         int seed_length = getSeedLength( seed );
00787         int seed_weight = getSeedWeight( seed );
00788         
00789         if(seed_length > CalculateMaxMerSize())
00790                 Throw_gnExMsg( SMLCreateError(), "Mer size is too large" );
00791 
00792         if(seed_length == 0)
00793                 Throw_gnExMsg( SMLCreateError(), "Can't have 0 seed length" );
00794 
00795         //determine sequence and sar length and read in sequence
00796         gnSeqI seq_len = seq.length();
00797         if(!seq.isCircular()){
00798                 header.circular = false;
00799         }else
00800                 header.circular = true;
00801         // use the nifty Array class as a wrapper for the buffer to ensure correct deallocation
00802         gnSeqI buf_len = seq.isCircular() ? seq_len + seed_length : seq_len;
00803         Array<gnSeqC> seq_buf( buf_len );
00804         seq.ToArray(seq_buf.data, seq_len);
00805         if( seq.isCircular() )
00806                 seq.ToArray(seq_buf.data + seq_len, seed_length-1);
00807 
00808         // set header information
00809         header.length = seq_len;
00810         header.seed_length = seed_length;
00811         header.seed_weight = seed_weight;
00812         header.seed = seed;
00813 
00814         SetMerMaskSize( seed_weight );
00815         seed_mask = mer_mask;
00816         SetMerMaskSize( seed_length );
00817 
00818         SetSequence( seq_buf.data, buf_len );
00819 }
00820 
00821 } // namespace mems

Generated on Fri Mar 14 06:01:04 2008 for libMems by doxygen 1.3.6