25#include "SamValidation.h"
27#include "BaseUtilities.h"
28#include "SamQuerySeqWithRefHelper.h"
30const char* SamRecord::DEFAULT_READ_NAME =
"UNKNOWN";
31const char* SamRecord::FIELD_ABSENT_STRING =
"=";
32int SamRecord::myNumWarns = 0;
37 mySequenceTranslation(NONE)
39 int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE +
sizeof(int32_t);
44 myCigarTempBuffer = NULL;
45 myCigarTempBufferAllocatedSize = 0;
47 allocatedSize = defaultAllocSize;
54 : myStatus(errorHandlingType),
56 mySequenceTranslation(NONE)
58 int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE +
sizeof(int32_t);
63 myCigarTempBuffer = NULL;
64 myCigarTempBufferAllocatedSize = 0;
66 allocatedSize = defaultAllocSize;
76 if(myRecordPtr != NULL)
81 if(myCigarTempBuffer != NULL)
83 free(myCigarTempBuffer);
84 myCigarTempBuffer = NULL;
85 myCigarTempBufferAllocatedSize = 0;
93 myIsBufferSynced =
true;
95 myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
96 myRecordPtr->myReferenceID = -1;
97 myRecordPtr->myPosition = -1;
98 myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
99 myRecordPtr->myMapQuality = 0;
100 myRecordPtr->myBin = DEFAULT_BIN;
101 myRecordPtr->myCigarLength = 0;
102 myRecordPtr->myFlag = 0;
103 myRecordPtr->myReadLength = 0;
104 myRecordPtr->myMateReferenceID = -1;
105 myRecordPtr->myMatePosition = -1;
106 myRecordPtr->myInsertSize = 0;
111 myReadName = DEFAULT_READ_NAME;
112 myReferenceName =
"*";
113 myMateReferenceName =
"*";
117 mySeqWithoutEq.clear();
119 myNeedToSetTagsFromBuffer =
false;
120 myNeedToSetTagsInBuffer =
false;
123 myAlignmentLength = -1;
124 myUnclippedStartOffset = -1;
125 myUnclippedEndOffset = -1;
133 memcpy(&(myRecordPtr->myData), myReadName.c_str(),
134 myRecordPtr->myReadNameLength);
137 myIsReadNameBufferValid =
true;
138 myIsCigarBufferValid =
true;
140 (
unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength +
141 myRecordPtr->myCigarLength *
sizeof(
int);
142 myIsSequenceBufferValid =
true;
143 myBufferSequenceTranslation =
NONE;
145 myPackedQuality = myPackedSequence;
146 myIsQualityBufferValid =
true;
147 myIsTagsBufferValid =
true;
150 myCigarTempBufferLength = -1;
154 NOT_FOUND_TAG_STRING =
"";
155 NOT_FOUND_TAG_INT = -1;
168 std::string errorMessage =
"";
180 myRefPtr = reference;
189 mySequenceTranslation = translation;
195 myReadName = readName;
196 myIsBufferSynced =
false;
197 myIsReadNameBufferValid =
false;
202 if(myReadName.Length() == 0)
205 myReadName = DEFAULT_READ_NAME;
206 myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
218 myRecordPtr->myFlag = flag;
224 const char* referenceName)
228 myReferenceName = referenceName;
230 myRecordPtr->myReferenceID = header.
getReferenceID(referenceName,
true);
245 myRecordPtr->myPosition = position;
246 myIsBinValid =
false;
254 myRecordPtr->myMapQuality = mapQuality;
264 myIsBufferSynced =
false;
265 myIsCigarBufferValid =
false;
266 myCigarTempBufferLength = -1;
267 myIsBinValid =
false;
270 myAlignmentLength = -1;
271 myUnclippedStartOffset = -1;
272 myUnclippedEndOffset = -1;
283 myIsBufferSynced =
false;
284 myIsCigarBufferValid =
false;
285 myCigarTempBufferLength = -1;
286 myIsBinValid =
false;
289 myAlignmentLength = -1;
290 myUnclippedStartOffset = -1;
291 myUnclippedEndOffset = -1;
298 const char* mateReferenceName)
304 if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
306 myMateReferenceName = myReferenceName;
310 myMateReferenceName = mateReferenceName;
315 myRecordPtr->myMateReferenceID =
331 myRecordPtr->myMatePosition = matePosition;
339 myRecordPtr->myInsertSize = insertSize;
349 mySeqWithoutEq.clear();
351 myIsBufferSynced =
false;
352 myIsSequenceBufferValid =
false;
361 myIsBufferSynced =
false;
362 myIsQualityBufferValid =
false;
374 if(myAlignmentLength == -1)
381 bool shifted =
false;
385 uint32_t currentPos = 0;
392 currentPos += myCigarRoller[0].count;
395 int numOps = myCigarRoller.
size();
399 for(
int currentOp = 1; currentOp < numOps; currentOp++)
404 int prevOpIndex = currentOp-1;
407 int nextOpIndex = currentOp+1;
408 if(nextOpIndex == numOps)
411 nextOpIndex = currentOp;
415 uint32_t prevOpStart =
416 currentPos - myCigarRoller[prevOpIndex].count;
423 currentPos += myCigarRoller[currentOp].count;
433 uint32_t insertEndPos =
434 currentPos + myCigarRoller[currentOp].count - 1;
437 uint32_t insertStartPos = currentPos;
446 while((insertStartPos > prevOpStart) &&
456 int shiftLen = currentPos - insertStartPos;
467 if(myCigarRoller[nextOpIndex].operation ==
468 myCigarRoller[prevOpIndex].operation)
477 if(myCigarRoller[prevOpIndex].count == 0)
479 myCigarRoller.
Remove(prevOpIndex);
488 if(insertStartPos == prevOpStart)
492 myCigarRoller.
Update(currentOp,
493 myCigarRoller[prevOpIndex].operation,
494 myCigarRoller[prevOpIndex].count);
497 myCigarRoller.
Update(prevOpIndex,
505 currentPos += myCigarRoller[currentOp].count;
510 currentPos += myCigarRoller[currentOp].count;
526 uint32_t fromBufferSize,
530 if((fromBuffer == NULL) || (fromBufferSize == 0))
534 "Cannot parse an empty file.");
542 if(!allocateRecordStructure(fromBufferSize))
548 memcpy(myRecordPtr, fromBuffer, fromBufferSize);
550 setVariablesForNewBuffer(header);
562 if((filePtr == NULL) || (filePtr->
isOpen() ==
false))
566 "Can't read from an unopened file.");
575 ifread(filePtr, &(myRecordPtr->myBlockSize),
sizeof(int32_t));
578 if(
ifeof(filePtr) && (numBytes == 0))
581 std::string statusMsg =
"No more records left to read, ";
589 if(numBytes !=
sizeof(int32_t))
597 std::string statusMsg =
"EOF reached in the middle of a record, ";
607 std::string statusMsg =
"Failed to read the record size, ";
617 if(!allocateRecordStructure(myRecordPtr->myBlockSize +
sizeof(int32_t)))
625 if(
ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
626 != (
unsigned int)myRecordPtr->myBlockSize)
630 std::string statusMsg =
"Failed to read the record, ";
638 setVariablesForNewBuffer(header);
654 int tagBufferSize = 0;
657 if(myNeedToSetTagsFromBuffer)
659 if(!setTagsFromBuffer())
674 if(value > ((std::numeric_limits<char>::min)()))
680 else if(value > ((std::numeric_limits<short>::min)()))
696 if(value < ((std::numeric_limits<unsigned char>::max)()))
702 else if(value < ((std::numeric_limits<unsigned short>::max)()))
717 key = MAKEKEY(tag[0], tag[1], bamvtype);
718 unsigned int hashIndex = extras.Find(key);
719 if(hashIndex != LH_NOTFOUND)
722 index = extras[hashIndex];
726 switch(intType[index])
743 "unknown tag inttype type found.\n");
749 if(myNumWarns++ < myMaxWarns)
753 appendIntArrayValue(index, origVal);
754 appendIntArrayValue(bamvtype, value, newVal);
755 fprintf(stderr,
"WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
756 tag[0], tag[1], intType[index], origVal.c_str(), tag[0], tag[1], bamvtype, newVal.c_str());
757 if(myNumWarns == myMaxWarns)
759 fprintf(stderr,
"Suppressing rest of Duplicate Tag warnings.\n");
764 integers[index] = value;
765 intType[index] = bamvtype;
770 index = integers.Length();
772 integers.Push(value);
773 intType.push_back(bamvtype);
775 extras.Add(key, index);
779 myNeedToSetTagsInBuffer =
true;
780 myIsTagsBufferValid =
false;
781 myIsBufferSynced =
false;
782 myTagBufferSize += tagBufferSize;
796 int intVal = atoi(valuePtr);
806 int tagBufferSize = 0;
809 if(myNeedToSetTagsFromBuffer)
811 if(!setTagsFromBuffer())
819 key = MAKEKEY(tag[0], tag[1], vtype);
820 unsigned int hashIndex = extras.Find(key);
821 if(hashIndex != LH_NOTFOUND)
824 index = extras[hashIndex];
827 char origType = vtype;
834 if((integers[index] == (
const int)*(valuePtr)) &&
835 (intType[index] == vtype))
843 origType = intType[index];
844 appendIntArrayValue(index, origTag);
845 tagBufferSize -= getNumericTagTypeSize(intType[index]);
846 tagBufferSize += getNumericTagTypeSize(vtype);
847 integers[index] = (
const int)*(valuePtr);
848 intType[index] = vtype;
853 if(strings[index] == valuePtr)
861 origTag = strings[index];
862 tagBufferSize -= strings[index].Length();
863 strings[index] = valuePtr;
865 tagBufferSize += strings[index].Length();
870 if(strings[index] == valuePtr)
878 origTag = strings[index];
879 tagBufferSize -= getBtagBufferSize(strings[index]);
880 strings[index] = valuePtr;
882 tagBufferSize += getBtagBufferSize(strings[index]);
887 if(floats[index] == (
float)atof(valuePtr))
895 origTag.appendFullFloat(floats[index]);
896 floats[index] = (float)atof(valuePtr);
901 "samRecord::addTag() - Unknown custom field of type %c\n",
904 "Unknown custom field in a tag");
912 if(myNumWarns++ < myMaxWarns)
914 fprintf(stderr,
"WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n",
915 tag[0], tag[1], origType, origTag.c_str(), tag[0], tag[1], vtype, valuePtr);
916 if(myNumWarns == myMaxWarns)
918 fprintf(stderr,
"Suppressing rest of Duplicate Tag warnings.\n");
928 index = integers.Length();
929 integers.Push((
const int)*(valuePtr));
930 intType.push_back(vtype);
934 index = strings.Length();
935 strings.Push(valuePtr);
936 tagBufferSize += 4 + strings.Last().Length();
939 index = strings.Length();
940 strings.Push(valuePtr);
941 tagBufferSize += 3 + getBtagBufferSize(strings[index]);
944 index = floats.size();
945 floats.push_back((
float)atof(valuePtr));
950 "samRecord::addTag() - Unknown custom field of type %c\n",
953 "Unknown custom field in a tag");
960 extras.Add(key, index);
968 myNeedToSetTagsInBuffer =
true;
969 myIsTagsBufferValid =
false;
970 myIsBufferSynced =
false;
971 myTagBufferSize += tagBufferSize;
979 if(extras.Entries() != 0)
999 "rmTag called with tag that is not 2 characters\n");
1004 if(myNeedToSetTagsFromBuffer)
1006 if(!setTagsFromBuffer())
1015 int key = MAKEKEY(tag[0], tag[1], type);
1017 int offset = extras.Find(key);
1029 getTypeFromKey(key, vtype);
1032 vtype = getIntegerType(offset);
1059 rmBuffSize = 4 +
getString(offset).Length();
1062 rmBuffSize = 3 + getBtagBufferSize(
getString(offset));
1066 "rmTag called with unknown type.\n");
1072 myNeedToSetTagsInBuffer =
true;
1073 myIsTagsBufferValid =
false;
1074 myIsBufferSynced =
false;
1075 myTagBufferSize -= rmBuffSize;
1078 extras.Delete(offset);
1085 const char* currentTagPtr = tags;
1088 if(myNeedToSetTagsFromBuffer)
1090 if(!setTagsFromBuffer())
1098 bool returnStatus =
true;
1101 while(*currentTagPtr !=
'\0')
1107 if((currentTagPtr[0] ==
'\0') || (currentTagPtr[1] ==
'\0') ||
1108 (currentTagPtr[2] !=
':') || (currentTagPtr[3] ==
'\0'))
1111 "rmTags called with improperly formatted tags.\n");
1112 returnStatus =
false;
1117 int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
1120 int offset = extras.Find(key);
1127 getTypeFromKey(key, vtype);
1130 vtype = getIntegerType(offset);
1156 rmBuffSize += 4 +
getString(offset).Length();
1159 rmBuffSize += 3 + getBtagBufferSize(
getString(offset));
1163 "rmTag called with unknown type.\n");
1164 returnStatus =
false;
1169 extras.Delete(offset);
1172 if((currentTagPtr[4] ==
';') || (currentTagPtr[4] ==
','))
1177 else if(currentTagPtr[4] !=
'\0')
1181 "rmTags called with improperly formatted tags.\n");
1182 returnStatus =
false;
1193 myNeedToSetTagsInBuffer =
true;
1194 myIsTagsBufferValid =
false;
1195 myIsBufferSynced =
false;
1196 myTagBufferSize -= rmBuffSize;
1199 return(returnStatus);
1217 if((myIsBufferSynced ==
false) ||
1218 (myBufferSequenceTranslation != translation))
1220 status &= fixBuffer(translation);
1223 if(myNeedToSetTagsInBuffer)
1225 status &= setTagsInBuffer();
1231 return (
const void *)myRecordPtr;
1248 if((filePtr == NULL) || (filePtr->
isOpen() ==
false))
1252 "Can't write to an unopened file.");
1256 if((myIsBufferSynced ==
false) ||
1257 (myBufferSequenceTranslation != translation))
1259 if(!fixBuffer(translation))
1266 unsigned int numBytesToWrite = myRecordPtr->myBlockSize +
sizeof(int32_t);
1267 unsigned int numBytesWritten =
1268 ifwrite(filePtr, myRecordPtr, numBytesToWrite);
1271 if(numBytesToWrite == numBytesWritten)
1286 if(myIsBufferSynced ==
false)
1291 fixBuffer(myBufferSequenceTranslation);
1293 return myRecordPtr->myBlockSize;
1301 return myReferenceName.c_str();
1308 return myRecordPtr->myReferenceID;
1315 return (myRecordPtr->myPosition + 1);
1322 return myRecordPtr->myPosition;
1331 if(myIsReadNameBufferValid)
1333 return(myRecordPtr->myReadNameLength);
1336 return(myReadName.Length() + 1);
1343 return myRecordPtr->myMapQuality;
1354 myRecordPtr->myBin =
1356 myIsBinValid =
true;
1358 return(myRecordPtr->myBin);
1367 if(myIsCigarBufferValid)
1369 return myRecordPtr->myCigarLength;
1372 if(myCigarTempBufferLength == -1)
1380 return(myCigarTempBufferLength);
1387 return myRecordPtr->myFlag;
1394 if(myIsSequenceBufferValid ==
false)
1397 if((mySequence.Length() == 1) && (mySequence[0] ==
'*'))
1402 return(mySequence.Length());
1404 return(myRecordPtr->myReadLength);
1413 return myMateReferenceName.c_str();
1423 if(myMateReferenceName ==
"*")
1425 return(myMateReferenceName);
1429 return(FIELD_ABSENT_STRING);
1433 return(myMateReferenceName);
1441 return myRecordPtr->myMateReferenceID;
1448 return (myRecordPtr->myMatePosition + 1);
1455 return myRecordPtr->myMatePosition;
1462 return myRecordPtr->myInsertSize;
1470 if(myAlignmentLength == -1)
1476 if(myAlignmentLength == 0)
1479 return(myRecordPtr->myPosition);
1481 return(myRecordPtr->myPosition + myAlignmentLength - 1);
1496 if(myAlignmentLength == -1)
1502 return(myAlignmentLength);
1509 if(myUnclippedStartOffset == -1)
1514 return(myRecordPtr->myPosition - myUnclippedStartOffset);
1545 if(myReadName.Length() == 0)
1549 myReadName = (
char*)&(myRecordPtr->myData);
1551 return myReadName.c_str();
1558 if(myCigar.Length() == 0)
1564 return myCigar.c_str();
1577 if(mySequence.Length() == 0)
1581 setSequenceAndQualityFromBuffer();
1585 if((translation ==
NONE) || (myRefPtr == NULL))
1587 return mySequence.c_str();
1589 else if(translation ==
EQUAL)
1591 if(mySeqWithEq.length() == 0)
1594 if(mySequence ==
"*")
1603 myRecordPtr->myPosition,
1610 return(mySeqWithEq.c_str());
1615 if(mySeqWithoutEq.length() == 0)
1617 if(mySequence ==
"*")
1620 mySeqWithoutEq =
'*';
1626 myRecordPtr->myPosition,
1633 return(mySeqWithoutEq.c_str());
1641 if(myQuality.Length() == 0)
1645 setSequenceAndQualityFromBuffer();
1647 return myQuality.c_str();
1653 return(
getSequence(index, mySequenceTranslation));
1659 static const char * asciiBases =
"=AC.G...T......N";
1667 String exceptionString =
"SamRecord::getSequence(";
1668 exceptionString += index;
1669 exceptionString +=
") is not allowed since sequence = '*'";
1670 throw std::runtime_error(exceptionString.c_str());
1672 else if((index < 0) || (index >= readLen))
1675 String exceptionString =
"SamRecord::getSequence(";
1676 exceptionString += index;
1677 exceptionString +=
") is out of range. Index must be between 0 and ";
1678 exceptionString += (readLen - 1);
1679 throw std::runtime_error(exceptionString.c_str());
1683 if((translation ==
NONE) || (myRefPtr == NULL))
1686 if(mySequence.Length() == 0)
1689 if(myIsSequenceBufferValid)
1692 asciiBases[myPackedSequence[index / 2] & 0xF] :
1693 asciiBases[myPackedSequence[index / 2] >> 4]);
1697 String exceptionString =
"SamRecord::getSequence(";
1698 exceptionString += index;
1699 exceptionString +=
") called with no sequence set";
1700 throw std::runtime_error(exceptionString.c_str());
1704 return(mySequence[index]);
1711 if(mySequence.Length() == 0)
1715 setSequenceAndQualityFromBuffer();
1719 if(translation ==
EQUAL)
1723 if(mySeqWithEq.length() == 0)
1728 if(mySequence ==
"*")
1737 myRecordPtr->myPosition,
1745 return(mySeqWithEq[index]);
1752 if(mySeqWithoutEq.length() == 0)
1757 if(mySequence ==
"*")
1760 mySeqWithoutEq =
'*';
1768 myRecordPtr->myPosition,
1776 return(mySeqWithoutEq[index]);
1793 else if((index < 0) || (index >= readLen))
1796 String exceptionString =
"SamRecord::getQuality(";
1797 exceptionString += index;
1798 exceptionString +=
") is out of range. Index must be between 0 and ";
1799 exceptionString += (readLen - 1);
1800 throw std::runtime_error(exceptionString.c_str());
1803 if(myQuality.Length() == 0)
1807 return(myPackedQuality[index] + 33);
1812 if((myQuality.Length() == 1) && (myQuality[0] ==
'*'))
1817 else if(index >= myQuality.Length())
1822 String exceptionString =
"SamRecord::getQuality(";
1823 exceptionString += index;
1824 exceptionString +=
") is out of range. Index must be between 0 and ";
1825 exceptionString += (myQuality.Length() - 1);
1826 throw std::runtime_error(exceptionString.c_str());
1830 return(myQuality[index]);
1842 if(myAlignmentLength == -1)
1847 return(&myCigarRoller);
1857 if(myAlignmentLength == -1)
1869 return(
getFields(recStruct, readName, cigar, sequence, quality,
1870 mySequenceTranslation));
1880 if(myIsBufferSynced ==
false)
1882 if(!fixBuffer(translation))
1932 if(myNeedToSetTagsFromBuffer)
1936 unsigned char * tagStart =
1937 (
unsigned char *)myRecordPtr->myData
1938 + myRecordPtr->myReadNameLength
1939 + myRecordPtr->myCigarLength *
sizeof(
int)
1940 + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
1945 uint32_t nonTagSize =
1946 tagStart - (
unsigned char*)&(myRecordPtr->myReferenceID);
1948 uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
1953 return(myTagBufferSize);
1965 if(myNeedToSetTagsFromBuffer)
1967 if(!setTagsFromBuffer())
1978 int maxTagIndex = extras.Capacity();
1979 if(myLastTagIndex >= maxTagIndex)
1987 bool tagFound =
false;
1989 while((tagFound ==
false) && (myLastTagIndex < maxTagIndex))
1991 if(extras.SlotInUse(myLastTagIndex))
1994 int key = extras.GetKey(myLastTagIndex);
1996 getTypeFromKey(key, vtype);
2002 *value = getFloatPtr(myLastTagIndex);
2005 *value = getIntegerPtr(myLastTagIndex, vtype);
2014 *value = getStringPtr(myLastTagIndex);
2018 "Unknown tag type");
2036 myLastTagIndex = -1;
2042 if((vtype ==
'c') || (vtype ==
'C') ||
2043 (vtype ==
's') || (vtype ==
'S') ||
2044 (vtype ==
'i') || (vtype ==
'I'))
2074 if((vtype ==
'Z') || (vtype ==
'B'))
2084 const char* currentTagPtr = tags;
2086 returnString.Clear();
2088 if(myNeedToSetTagsFromBuffer)
2090 if(!setTagsFromBuffer())
2098 bool returnStatus =
true;
2100 while(*currentTagPtr !=
'\0')
2105 if((currentTagPtr[0] ==
'\0') || (currentTagPtr[1] ==
'\0') ||
2106 (currentTagPtr[2] !=
':') || (currentTagPtr[3] ==
'\0'))
2109 "getTagsString called with improperly formatted tags.\n");
2110 returnStatus =
false;
2115 int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1],
2118 int offset = extras.Find(key);
2123 if(!returnString.IsEmpty())
2125 returnString += delim;
2127 returnString += currentTagPtr[0];
2128 returnString += currentTagPtr[1];
2129 returnString +=
':';
2130 returnString += currentTagPtr[3];
2131 returnString +=
':';
2135 getTypeFromKey(key, vtype);
2140 returnString += *(
int*)getIntegerPtr(offset, vtype);
2143 returnString += *(
float*)getFloatPtr(offset);
2147 returnString += *(
String*)getStringPtr(offset);
2151 "rmTag called with unknown type.\n");
2152 returnStatus =
false;
2157 if((currentTagPtr[4] ==
';') || (currentTagPtr[4] ==
','))
2162 else if(currentTagPtr[4] !=
'\0')
2166 "rmTags called with improperly formatted tags.\n");
2167 returnStatus =
false;
2176 return(returnStatus);
2183 if(myNeedToSetTagsFromBuffer)
2185 if(!setTagsFromBuffer())
2194 int key = MAKEKEY(tag[0], tag[1],
'Z');
2195 int offset = extras.Find(key);
2201 key = MAKEKEY(tag[0], tag[1],
'B');
2202 offset = extras.Find(key);
2211 value = extras[offset];
2212 return(&(strings[value]));
2221 if(myNeedToSetTagsFromBuffer)
2223 if(!setTagsFromBuffer())
2232 int key = MAKEKEY(tag[0], tag[1],
'i');
2233 int offset = extras.Find(key);
2242 value = extras[offset];
2244 return(&(integers[value]));
2253 if(myNeedToSetTagsFromBuffer)
2255 if(!setTagsFromBuffer())
2264 int key = MAKEKEY(tag[0], tag[1],
'i');
2265 int offset = extras.Find(key);
2274 value = extras[offset];
2276 tagVal = integers[value];
2286 if(myNeedToSetTagsFromBuffer)
2288 if(!setTagsFromBuffer())
2297 int key = MAKEKEY(tag[0], tag[1],
'f');
2298 int offset = extras.Find(key);
2307 value = extras[offset];
2309 tagVal = floats[value];
2319 if(myNeedToSetTagsFromBuffer)
2321 if(!setTagsFromBuffer())
2329 int key = MAKEKEY(tag[0], tag[1],
'Z');
2330 int offset = extras.Find(key);
2336 key = MAKEKEY(tag[0], tag[1],
'B');
2337 offset = extras.Find(key);
2341 return(NOT_FOUND_TAG_STRING);
2344 value = extras[offset];
2346 return strings[value];
2355 if(myNeedToSetTagsFromBuffer)
2357 if(!setTagsFromBuffer())
2365 int key = MAKEKEY(tag[0], tag[1],
'i');
2366 int offset = extras.Find(key);
2372 return NOT_FOUND_TAG_INT;
2375 value = extras[offset];
2377 return integers[value];
2386 if(myNeedToSetTagsFromBuffer)
2388 if(!setTagsFromBuffer())
2396 int key = MAKEKEY(tag[0], tag[1], type);
2398 return (extras.Find(key) != LH_NOTFOUND);
2412bool SamRecord::allocateRecordStructure(
int size)
2414 if (allocatedSize < size)
2418 if(tmpRecordPtr == NULL)
2421 fprintf(stderr,
"FAILED TO ALLOCATE MEMORY!!!");
2426 myRecordPtr = tmpRecordPtr;
2429 if(myIsSequenceBufferValid)
2431 myPackedSequence = (
unsigned char *)myRecordPtr->myData +
2432 myRecordPtr->myReadNameLength +
2433 myRecordPtr->myCigarLength *
sizeof(
int);
2435 if(myIsQualityBufferValid)
2437 myPackedQuality = (
unsigned char *)myRecordPtr->myData +
2438 myRecordPtr->myReadNameLength +
2439 myRecordPtr->myCigarLength *
sizeof(
int) +
2440 (myRecordPtr->myReadLength + 1) / 2;
2443 allocatedSize = size;
2450void* SamRecord::getStringPtr(
int index)
2452 int value = extras[index];
2454 return &(strings[value]);
2457void* SamRecord::getIntegerPtr(
int offset,
char& type)
2459 int value = extras[offset];
2461 type = intType[value];
2463 return &(integers[value]);
2466void* SamRecord::getFloatPtr(
int offset)
2468 int value = extras[offset];
2470 return &(floats[value]);
2475bool SamRecord::fixBuffer(SequenceTranslation translation)
2478 if(myIsBufferSynced &&
2479 (myBufferSequenceTranslation == translation))
2490 myRecordPtr->myBin =
2492 myIsBinValid =
true;
2503 uint32_t bamSequenceLen = (newReadLen+1)/2;
2509 ((
unsigned char*)(&(myRecordPtr->myData)) -
2510 (
unsigned char*)myRecordPtr) +
2511 newReadNameLen + ((newCigarLen)*4) +
2512 newReadLen + bamSequenceLen + newTagLen;
2514 if(!allocateRecordStructure(newBufferSize))
2525 bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
2526 bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
2527 bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
2531 if(myIsTagsBufferValid &&
2532 (readNameLenChange | cigarLenChange | readLenChange))
2534 status &= setTagsFromBuffer();
2537 myIsTagsBufferValid =
false;
2543 if((myIsQualityBufferValid | myIsSequenceBufferValid) &&
2544 (readNameLenChange | cigarLenChange | readLenChange))
2546 setSequenceAndQualityFromBuffer();
2549 myIsQualityBufferValid =
false;
2550 myIsSequenceBufferValid =
false;
2555 if((myIsCigarBufferValid) &&
2556 (readNameLenChange))
2558 status &= parseCigarBinary();
2559 myIsCigarBufferValid =
false;
2563 if(!myIsReadNameBufferValid)
2565 memcpy(&(myRecordPtr->myData), myReadName.c_str(),
2569 myRecordPtr->myReadNameLength = newReadNameLen;
2570 myIsReadNameBufferValid =
true;
2573 unsigned char * readNameEnds = (
unsigned char*)(&(myRecordPtr->myData)) +
2574 myRecordPtr->myReadNameLength;
2577 unsigned int * packedCigar = (
unsigned int *) (
void *) readNameEnds;
2579 if(!myIsCigarBufferValid)
2583 myRecordPtr->myCigarLength = newCigarLen;
2584 memcpy(packedCigar, myCigarTempBuffer,
2585 myRecordPtr->myCigarLength *
sizeof(uint32_t));
2587 myIsCigarBufferValid =
true;
2590 unsigned char * packedSequence = readNameEnds +
2591 myRecordPtr->myCigarLength *
sizeof(int);
2592 unsigned char * packedQuality = packedSequence + bamSequenceLen;
2594 if(!myIsSequenceBufferValid || !myIsQualityBufferValid ||
2595 (myBufferSequenceTranslation != translation))
2597 myRecordPtr->myReadLength = newReadLen;
2600 bool noQuality =
false;
2601 if((myQuality.Length() == 1) && (myQuality[0] ==
'*'))
2606 const char* translatedSeq = NULL;
2610 if((!myIsSequenceBufferValid) ||
2611 (translation != myBufferSequenceTranslation))
2616 for (
int i = 0; i < myRecordPtr->myReadLength; i++)
2618 if((!myIsSequenceBufferValid) ||
2619 (translation != myBufferSequenceTranslation))
2623 switch(translatedSeq[i])
2651 "Unknown Sequence character found.");
2660 packedSequence[i/2] |= seqVal;
2665 packedSequence[i/2] = seqVal << 4;
2669 if(!myIsQualityBufferValid)
2672 if((noQuality) || (myQuality.Length() <= i))
2676 packedQuality[i] = 0xFF;
2681 packedQuality[i] = myQuality[i] - 33;
2685 myPackedSequence = (
unsigned char *)myRecordPtr->myData +
2686 myRecordPtr->myReadNameLength +
2687 myRecordPtr->myCigarLength *
sizeof(
int);
2688 myPackedQuality = myPackedSequence +
2689 (myRecordPtr->myReadLength + 1) / 2;
2690 myIsSequenceBufferValid =
true;
2691 myIsQualityBufferValid =
true;
2692 myBufferSequenceTranslation = translation;
2695 if(!myIsTagsBufferValid)
2697 status &= setTagsInBuffer();
2701 myRecordPtr->myReadNameLength = newReadNameLen;
2702 myRecordPtr->myCigarLength = newCigarLen;
2703 myRecordPtr->myReadLength = newReadLen;
2707 myRecordPtr->myBlockSize = newBufferSize -
sizeof(int32_t);
2711 myIsBufferSynced =
true;
2721void SamRecord::setSequenceAndQualityFromBuffer()
2729 bool extractSequence =
false;
2730 if(myIsSequenceBufferValid && (mySequence.Length() == 0))
2732 extractSequence =
true;
2736 bool extractQuality =
false;
2737 if(myIsQualityBufferValid && (myQuality.Length() == 0))
2739 extractQuality =
true;
2744 if(!extractSequence && !extractQuality)
2752 mySequence.SetLength(myRecordPtr->myReadLength);
2756 myQuality.SetLength(myRecordPtr->myReadLength);
2759 const char * asciiBases =
"=AC.G...T......N";
2763 bool qualitySpecified =
false;
2765 for (
int i = 0; i < myRecordPtr->myReadLength; i++)
2769 mySequence[i] = i & 1 ?
2770 asciiBases[myPackedSequence[i / 2] & 0xF] :
2771 asciiBases[myPackedSequence[i / 2] >> 4];
2776 if(myPackedQuality[i] != 0xFF)
2779 qualitySpecified =
true;
2782 myQuality[i] = myPackedQuality[i] + 33;
2787 if(myRecordPtr->myReadLength == 0)
2798 else if(extractQuality && !qualitySpecified)
2807bool SamRecord::parseCigar()
2810 if(myCigar.Length() == 0)
2813 return(parseCigarBinary());
2815 return(parseCigarString());
2819bool SamRecord::parseCigarBinary()
2824 if(myCigar.Length() != 0)
2830 unsigned char * readNameEnds =
2831 (
unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
2833 unsigned int * packedCigar = (
unsigned int *) (
void *) readNameEnds;
2835 myCigarRoller.
Set(packedCigar, myRecordPtr->myCigarLength);
2845 if(myRecordPtr->myCigarLength == 0)
2852 int newBufferSize = myRecordPtr->myCigarLength *
sizeof(uint32_t);
2853 if(newBufferSize > myCigarTempBufferAllocatedSize)
2855 uint32_t* tempBufferPtr =
2856 (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2857 if(tempBufferPtr == NULL)
2861 fprintf(stderr,
"FAILED TO ALLOCATE MEMORY!!!");
2863 "Failed to Allocate Memory.");
2866 myCigarTempBuffer = tempBufferPtr;
2867 myCigarTempBufferAllocatedSize = newBufferSize;
2870 memcpy(myCigarTempBuffer, packedCigar,
2871 myRecordPtr->myCigarLength *
sizeof(uint32_t));
2874 myCigarTempBufferLength = myRecordPtr->myCigarLength;
2880bool SamRecord::parseCigarString()
2882 myCigarTempBufferLength = 0;
2886 myAlignmentLength = 0;
2887 myUnclippedStartOffset = 0;
2888 myUnclippedEndOffset = 0;
2889 myCigarRoller.
clear();
2893 myCigarRoller.
Set(myCigar);
2903 int newBufferSize = myCigar.Length() *
sizeof(uint32_t);
2904 if(newBufferSize > myCigarTempBufferAllocatedSize)
2906 uint32_t* tempBufferPtr =
2907 (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
2908 if(tempBufferPtr == NULL)
2912 fprintf(stderr,
"FAILED TO ALLOCATE MEMORY!!!");
2914 "Failed to Allocate Memory.");
2917 myCigarTempBuffer = tempBufferPtr;
2918 myCigarTempBufferAllocatedSize = newBufferSize;
2926 const char* cigarEntryStart = myCigar.c_str();
2930 unsigned int * packedCigar = myCigarTempBuffer;
2933 const char* endCigarString = cigarEntryStart + myCigar.Length();
2934 while(cigarEntryStart < endCigarString)
2936 bool validCigarEntry =
true;
2939 opLen = strtol(cigarEntryStart, &cigarOp, 10);
2967 fprintf(stderr,
"ERROR parsing cigar\n");
2968 validCigarEntry =
false;
2971 "Unknown operation found when parsing the Cigar.");
2977 ++myCigarTempBufferLength;
2978 *packedCigar = (opLen << 4) | op;
2982 cigarEntryStart = ++cigarOp;
2990bool SamRecord::setTagsFromBuffer()
2993 if(myNeedToSetTagsFromBuffer ==
false)
3000 myNeedToSetTagsFromBuffer =
false;
3002 unsigned char * extraPtr = myPackedQuality + myRecordPtr->myReadLength;
3009 while (myRecordPtr->myBlockSize + 4 -
3010 (extraPtr - (
unsigned char *)myRecordPtr) > 0)
3014 void * content = extraPtr + 3;
3015 int tagBufferSize = 0;
3017 key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]);
3020 unsigned int location = extras.Find(key);
3022 String* duplicate = NULL;
3024 if(location != LH_NOTFOUND)
3028 origIndex = extras[location];
3030 *duplicate = (char)(extraPtr[0]);
3031 *duplicate += (char)(extraPtr[1]);
3034 *origTag = *duplicate;
3035 *duplicate += (char)(extraPtr[2]);
3039 switch (extraPtr[2])
3042 if(duplicate != NULL)
3044 *duplicate += (* (
char *) content);
3045 *origTag += intType[origIndex];
3047 appendIntArrayValue(origIndex, *origTag);
3048 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3049 integers[origIndex] = *(
char *)content;
3050 intType[origIndex] = extraPtr[2];
3051 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3055 value = integers.Length();
3056 integers.Push(* (
char *) content);
3057 intType.push_back(extraPtr[2]);
3063 if(duplicate != NULL)
3065 *duplicate += (* (
char *) content);
3066 *origTag += intType[origIndex];
3068 appendIntArrayValue(origIndex, *origTag);
3069 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3070 integers[origIndex] = *(
char *)content;
3071 intType[origIndex] = extraPtr[2];
3072 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3076 value = integers.Length();
3077 integers.Push(* (
char *) content);
3078 intType.push_back(extraPtr[2]);
3084 if(duplicate != NULL)
3086 *duplicate += (* (
unsigned char *) content);
3087 *origTag += intType[origIndex];
3089 appendIntArrayValue(origIndex, *origTag);
3090 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3091 integers[origIndex] = *(
unsigned char *)content;
3092 intType[origIndex] = extraPtr[2];
3093 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3097 value = integers.Length();
3098 integers.Push(* (
unsigned char *) content);
3099 intType.push_back(extraPtr[2]);
3105 if(duplicate != NULL)
3107 *duplicate += (* (
short *) content);
3108 *origTag += intType[origIndex];
3110 appendIntArrayValue(origIndex, *origTag);
3111 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3112 integers[origIndex] = *(
short *)content;
3113 intType[origIndex] = extraPtr[2];
3114 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3118 value = integers.Length();
3119 integers.Push(* (
short *) content);
3120 intType.push_back(extraPtr[2]);
3126 if(duplicate != NULL)
3128 *duplicate += (* (
unsigned short *) content);
3129 *origTag += intType[origIndex];
3131 appendIntArrayValue(origIndex, *origTag);
3132 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3133 integers[origIndex] = *(
unsigned short *)content;
3134 intType[origIndex] = extraPtr[2];
3135 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3139 value = integers.Length();
3140 integers.Push(* (
unsigned short *) content);
3141 intType.push_back(extraPtr[2]);
3147 if(duplicate != NULL)
3149 *duplicate += (* (
int *) content);
3150 *origTag += intType[origIndex];
3152 appendIntArrayValue(origIndex, *origTag);
3153 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3154 integers[origIndex] = *(
int *)content;
3155 intType[origIndex] = extraPtr[2];
3156 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3160 value = integers.Length();
3161 integers.Push(* (
int *) content);
3162 intType.push_back(extraPtr[2]);
3168 if(duplicate != NULL)
3170 *duplicate += (* (
unsigned int *) content);
3171 *origTag += intType[origIndex];
3173 appendIntArrayValue(origIndex, *origTag);
3174 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]);
3175 integers[origIndex] = *(
unsigned int *)content;
3176 intType[origIndex] = extraPtr[2];
3177 tagBufferSize += getNumericTagTypeSize(intType[origIndex]);
3181 value = integers.Length();
3182 integers.Push((
int) * (
unsigned int *) content);
3183 intType.push_back(extraPtr[2]);
3189 if(duplicate != NULL)
3191 *duplicate += ((
const char *) content);
3194 *origTag += (
char*)(strings[origIndex]);
3195 tagBufferSize -= strings[origIndex].Length();
3196 strings[origIndex] = (
const char *) content;
3197 extraPtr += 4 + strings[origIndex].Length();
3198 tagBufferSize += strings[origIndex].Length();
3202 value = strings.Length();
3203 strings.Push((
const char *) content);
3204 tagBufferSize += 4 + strings.Last().Length();
3205 extraPtr += 4 + strings.Last().Length();
3209 if(duplicate != NULL)
3213 *origTag += (
char*)(strings[origIndex]);
3215 getBtagBufferSize(strings[origIndex]);
3217 getStringFromBtagBuffer((
unsigned char*)content,
3218 strings[origIndex]);
3219 *duplicate += (
char *)(strings[origIndex]);
3220 tagBufferSize += bufferSize;
3221 extraPtr += 3 + bufferSize;
3225 value = strings.Length();
3228 getStringFromBtagBuffer((
unsigned char*)content,
3230 strings.Push(tempBTag);
3231 tagBufferSize += 3 + bufferSize;
3232 extraPtr += 3 + bufferSize;
3236 if(duplicate != NULL)
3238 duplicate->appendFullFloat(* (
float *) content);
3241 origTag->appendFullFloat(floats[origIndex]);
3242 floats[origIndex] = *(
float *)content;
3246 value = floats.size();
3247 floats.push_back(* (
float *) content);
3254 "parsing BAM - Unknown custom field of type %c%c:%c\n",
3255 extraPtr[0], extraPtr[1], extraPtr[2]);
3256 fprintf(stderr,
"BAM Tags: \n");
3258 unsigned char* tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3260 fprintf(stderr,
"\n\n");
3261 tagInfo = myPackedQuality + myRecordPtr->myReadLength;
3262 while(myRecordPtr->myBlockSize + 4 -
3263 (tagInfo - (
unsigned char *)myRecordPtr) > 0)
3265 fprintf(stderr,
"%02x",tagInfo[0]);
3268 fprintf(stderr,
"\n");
3274 "Unknown tag type.");
3278 if(duplicate != NULL)
3283 if(myNumWarns++ < myMaxWarns)
3285 fprintf(stderr,
"WARNING: Duplicate Tags, overwritting %s with %s\n",
3286 origTag->c_str(), duplicate->c_str());
3287 if(myNumWarns == myMaxWarns)
3289 fprintf(stderr,
"Suppressing rest of Duplicate Tag warnings.\n");
3300 extras.Add(key, value);
3301 myTagBufferSize += tagBufferSize;
3308bool SamRecord::setTagsInBuffer()
3313 int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
3314 int newBufferSize = ((
unsigned char*)(&(myRecordPtr->myData)) -
3315 (
unsigned char*)myRecordPtr) +
3316 myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
3317 myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
3320 if(!allocateRecordStructure(newBufferSize))
3326 char * extraPtr = (
char*)myPackedQuality + myRecordPtr->myReadLength;
3331 if (extras.Entries())
3333 for (
int i = 0; i < extras.Capacity(); i++)
3335 if (extras.SlotInUse(i))
3337 int key = extras.GetKey(i);
3338 getTag(key, extraPtr);
3341 getTypeFromKey(key, vtype);
3345 vtype = getIntegerType(i);
3348 extraPtr[0] = vtype;
3366 *(uint8_t*)extraPtr = (uint8_t)
getInteger(i);
3371 *(int16_t*)extraPtr = (int16_t)
getInteger(i);
3376 *(uint16_t*)extraPtr = (uint16_t)
getInteger(i);
3381 *(int32_t*)extraPtr = (int32_t)
getInteger(i);
3386 *(uint32_t*)extraPtr = (uint32_t)
getInteger(i);
3391 sprintf(extraPtr,
"%s",
getString(i).c_str());
3395 extraPtr += setBtagBuffer(
getString(i), extraPtr);
3401 *(
float*)extraPtr = getFloat(i);
3406 "Unknown tag type.");
3416 if(extraPtr != (
char*)myRecordPtr + newBufferSize)
3418 fprintf(stderr,
"ERROR updating the buffer. Incorrect size.");
3420 "ERROR updating the buffer. Incorrect size.");
3426 myNeedToSetTagsInBuffer =
false;
3427 myIsTagsBufferValid =
true;
3435void SamRecord::setVariablesForNewBuffer(
SamFileHeader& header)
3441 myMateReferenceName =
3445 myReadName.SetLength(0);
3446 myCigar.SetLength(0);
3447 mySequence.SetLength(0);
3448 mySeqWithEq.clear();
3449 mySeqWithoutEq.clear();
3450 myQuality.SetLength(0);
3451 myNeedToSetTagsFromBuffer =
true;
3452 myNeedToSetTagsInBuffer =
false;
3455 myIsBufferSynced =
true;
3457 myIsReadNameBufferValid =
true;
3458 myIsCigarBufferValid =
true;
3459 myPackedSequence = (
unsigned char *)myRecordPtr->myData +
3460 myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength *
sizeof(
int);
3461 myIsSequenceBufferValid =
true;
3462 myBufferSequenceTranslation =
NONE;
3463 myPackedQuality = myPackedSequence +
3464 (myRecordPtr->myReadLength + 1) / 2;
3465 myIsQualityBufferValid =
true;
3466 myIsTagsBufferValid =
true;
3467 myIsBinValid =
true;
3472void SamRecord::getTypeFromKey(
int key,
char& type)
const
3475 type = (key >> 16) & 0xFF;
3480void SamRecord::getTag(
int key,
char* tag)
const
3483 tag[0] = key & 0xFF;
3484 tag[1] = (key >> 8) & 0xFF;
3492 int value = extras[index];
3494 return strings[value];
3499 int value = extras[offset];
3501 return integers[value];
3504const char & SamRecord::getIntegerType(
int offset)
const
3506 int value = extras[offset];
3508 return intType[value];
3511float & SamRecord::getFloat(
int offset)
3513 int value = extras[offset];
3515 return floats[value];
3519void SamRecord::appendIntArrayValue(
char type,
int value,
String& strVal)
const
3524 strVal += (char)value;
3541int SamRecord::getBtagBufferSize(
String& tagStr)
3543 if(tagStr.Length() < 1)
3547 "SamRecord::getBtagBufferSize no tag subtype specified");
3550 char type = tagStr[0];
3551 int elementSize = getNumericTagTypeSize(type);
3552 if(elementSize <= 0)
3555 String errorMsg =
"SamRecord::getBtagBufferSize invalid tag subtype, ";
3562 int numElements = 0;
3563 int index = tagStr.FastFindChar(
',', 0);
3567 index = tagStr.FastFindChar(
',', index+1);
3570 return(numElements * elementSize + 5);
3574int SamRecord::setBtagBuffer(
String& tagStr,
char* extraPtr)
3576 if(tagStr.Length() < 1)
3580 "SamRecord::getBtagBufferSize no tag subtype specified");
3583 char type = tagStr[0];
3584 int elementSize = getNumericTagTypeSize(type);
3585 if(elementSize <= 0)
3588 String errorMsg =
"SamRecord::getBtagBufferSize invalid tag subtype, ";
3596 *(
char*)extraPtr = type;
3601 uint32_t numElements = 0;
3602 int index = tagStr.FastFindChar(
',', 0);
3606 index = tagStr.FastFindChar(
',', index+1);
3608 *(uint32_t*)extraPtr = numElements;
3612 const char* stringPtr = tagStr.c_str();
3613 const char* endPtr = stringPtr + tagStr.Length();
3617 char* newPtr = NULL;
3618 while(stringPtr < endPtr)
3623 *(
float*)extraPtr = (
float)(strtod(stringPtr, &newPtr));
3626 *(int8_t*)extraPtr = (int8_t)strtol(stringPtr, &newPtr, 0);
3629 *(int16_t*)extraPtr = (int16_t)strtol(stringPtr, &newPtr, 0);
3632 *(int32_t*)extraPtr = (int32_t)strtol(stringPtr, &newPtr, 0);
3635 *(uint8_t*)extraPtr = (uint8_t)strtoul(stringPtr, &newPtr, 0);
3638 *(uint16_t*)extraPtr = (uint16_t)strtoul(stringPtr, &newPtr, 0);
3641 *(uint32_t*)extraPtr = (uint32_t)strtoul(stringPtr, &newPtr, 0);
3645 "Unknown 'B' tag subtype.");
3648 extraPtr += elementSize;
3649 totalInc += elementSize;
3650 stringPtr = newPtr + 1;
3656int SamRecord::getStringFromBtagBuffer(
unsigned char* buffer,
3664 char type = *buffer;
3670 unsigned int numEntries = *(
unsigned int *)buffer;
3675 int subtypeSize = getNumericTagTypeSize(type);
3677 for(
unsigned int i = 0; i < numEntries; i++)
3683 tagStr.appendFullFloat(*(
float *)buffer);
3686 tagStr += *(int8_t *)buffer;
3689 tagStr += *(int16_t *)buffer;
3692 tagStr += *(int32_t *)buffer;
3695 tagStr += *(uint8_t *)buffer;
3698 tagStr += *(uint16_t *)buffer;
3701 tagStr += *(uint32_t *)buffer;
3705 "Unknown 'B' tag subtype.");
3708 buffer += subtypeSize;
3709 bufferSize += subtypeSize;
static const char UNKNOWN_QUALITY_CHAR
Character used when the quality is unknown.
bool Remove(int index)
Remove the operation at the specified index.
bool IncrementCount(int index, int increment)
Increments the count for the operation at the specified index by the specified value,...
bool Update(int index, Operation op, int count)
Updates the operation at the specified index to be the specified operation and have the specified cou...
void clear()
Clear this object so that it has no Cigar Operations.
void Set(const char *cigarString)
Sets this object to the specified cigarString.
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
int size() const
Return the number of cigar operations.
static bool isMatchOrMismatch(Operation op)
Return true if the specified operation is a match/mismatch operation, false if not.
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object.
HandlingType
This specifies how this class should respond to errors.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting '=' to the appropriate base using the reference.
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with '=' in any position where the sequence matches the reference.
int32_t getBlockSize()
Get the block size of the record (BAM format).
uint16_t getCigarLength()
Get the length of the BAM formatted CIGAR.
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
@ NONE
Leave the sequence as is.
@ BASES
Translate '=' to the actual base.
@ EQUAL
Translate bases that match the reference to '='.
bool setReadName(const char *readName)
Set QNAME to the passed in name.
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
int32_t get0BasedMatePosition()
Get the 0-based(BAM) leftmost mate/next fragment's position.
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
void clearTags()
Clear the tags in this record.
bool addIntTag(const char *tag, int32_t value)
Add the specified integer tag to the record.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
bool getTagsString(const char *tags, String &returnString, char delim='\t')
Get the string representation of the tags from the record, formatted as TAG:TYPE:VALUE<delim>TAG:TYPE...
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
int32_t getAlignmentLength()
Returns the length of the clipped sequence, returning 0 if the cigar is '*'.
int & getInteger(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
int32_t get1BasedAlignmentEnd()
Returns the 1-based inclusive rightmost position of the clipped sequence.
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
SamRecord()
Default Constructor.
static bool isIntegerType(char vtype)
Returns whether or not the specified vtype is an integer type.
bool rmTag(const char *tag, char type)
Remove a tag.
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
bool getFloatTag(const char *tag, float &tagVal)
Get the float value for the specified tag.
SamStatus::Status writeRecordBuffer(IFILE filePtr)
Write the record as a BAM into the specified already opened file.
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
static bool isFloatType(char vtype)
Returns whether or not the specified vtype is a float type.
SamStatus::Status setBuffer(const char *fromBuffer, uint32_t fromBufferSize, SamFileHeader &header)
Sets the SamRecord to contain the information in the BAM formatted fromBuffer.
int32_t get1BasedUnclippedStart()
Returns the 1-based inclusive left-most position adjusted for clipped bases.
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
uint16_t getBin()
Get the BAM bin for the record.
bool isValid(SamFileHeader &header)
Returns whether or not the record is valid, setting the status to indicate success or failure.
int32_t getMateReferenceID()
Get the mate reference id of the record (BAM format: mate_rid/next_refID).
bool getFields(bamRecordStruct &recStruct, String &readName, String &cigar, String &sequence, String &quality)
Returns the values of all fields except the tags.
bool set0BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position using the specified 0-based (BAM format) value.
void resetRecord()
Reset the fields of the record to a default value.
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader &header)
Read the BAM record from a file.
uint16_t getFlag()
Get the flag (FLAG).
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
int32_t get0BasedUnclippedEnd()
Returns the 0-based inclusive right-most position adjusted for clipped bases.
bool shiftIndelsLeft()
Shift the indels (if any) to the left by updating the CIGAR.
int * getIntegerTag(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure...
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
static bool isCharType(char vtype)
Returns whether or not the specified vtype is a char type.
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
int32_t get1BasedUnclippedEnd()
Returns the 1-based inclusive right-most position adjusted for clipped bases.
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
const char * getMateReferenceName()
Get the mate/next fragment's reference sequence name (RNEXT).
bool checkTag(const char *tag, char type)
Check if the specified tag contains a value of the specified vtype.
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
int32_t get0BasedUnclippedStart()
Returns the 0-based inclusive left-most position adjusted for clipped bases.
int32_t getReadLength()
Get the length of the read.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
const char * getCigar()
Returns the SAM formatted CIGAR string.
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
const String & getString(const char *tag)
Get the string value for the specified tag.
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
void resetTagIter()
Reset the tag iterator to the beginning of the tags.
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
bool rmTags(const char *tags)
Remove tags.
static bool isStringType(char vtype)
Returns whether or not the specified vtype is a string type.
The SamValidationErrors class is a container class that holds SamValidationError Objects,...
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations.
This class is used to track the status results of some methods in the BAM classes.
Status
Return value enum for StatGenFile methods.
@ FAIL_MEM
fail a memory allocation.
@ NO_MORE_RECS
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
@ SUCCESS
method completed successfully.
@ FAIL_IO
method failed due to an I/O issue.
@ INVALID
invalid other than for sorting.
@ FAIL_PARSE
failed to parse a record/header - invalid format.
@ FAIL_ORDER
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
Status getStatus() const
Return the enum for this status object.
void addError(Status newStatus, const char *newMessage)
Add the specified error message to the status message, setting the status to newStatus if the current...
Structure of a BAM record.