struct
type t = {
ref_id : int ;
pos : int ;
bin_mq_nl : int32 ;
flag_nc : int32 ;
next_ref_id : int ;
pnext : int ;
tlen : int ;
read_name : string ;
cigar : string ;
seq : string ;
qual : string ;
optional : string ;
} with sexp
let option ~none x =
if x = none then None
else Some x
let ref_id al = option ~none:(- 1) al.ref_id
let qname al = option ~none:"*" al.read_name
let flags al =
Int32.shift_right al.flag_nc 16
|> Int32.to_int_exn
|> Sam.Flags.of_int
let rname al header =
try Ok (Option.map (ref_id al) ~f:(fun id -> (Array.get header.ref_seq id).Sam.name))
with _ -> error_string "Bam.Alignment0.rname: unknown ref_id"
let pos al = option ~none:(- 1) al.pos
let mapq al = option ~none:255 (get_16_8 al.bin_mq_nl)
let cigar_op_of_s32 x : Sam.cigar_op Or_error.t =
let open Sam in
let op_len = get_32_4 x in
match get_4_0 x with
| 0 -> cigar_op_alignment_match op_len
| 1 -> cigar_op_insertion op_len
| 2 -> cigar_op_deletion op_len
| 3 -> cigar_op_skipped op_len
| 4 -> cigar_op_soft_clipping op_len
| 5 -> cigar_op_hard_clipping op_len
| 6 -> cigar_op_padding op_len
| 7 -> cigar_op_seq_match op_len
| 8 -> cigar_op_seq_mismatch op_len
| _ -> assert false
let cigar al =
result_list_init (String.length al.cigar / 4) ~f:(fun i ->
let s32 = Binary_packing.unpack_signed_32 ~byte_order:`Little_endian ~buf:al.cigar ~pos:(i * 4) in
cigar_op_of_s32 s32
)
let rnext al header =
match al.next_ref_id with
| -1 -> return None
| i -> Sam.parse_rnext header.ref_seq.(i).Sam.name
let pnext al = option ~none:(- 1) al.pnext
let tlen al = option ~none:0 al.tlen
let l_seq al = String.length al.qual
let char_of_seq_code = function
| 0 -> '='
| 1 -> 'A'
| 2 -> 'C'
| 3 -> 'M'
| 4 -> 'G'
| 5 -> 'R'
| 6 -> 'S'
| 7 -> 'V'
| 8 -> 'T'
| 9 -> 'W'
| 10 -> 'Y'
| 11 -> 'H'
| 12 -> 'K'
| 13 -> 'D'
| 14 -> 'B'
| 15 -> 'N'
| l -> failwithf "letter not in [0, 15]: %d" l ()
let seq al =
let n = String.length al.seq in
if n = 0 then None
else
let l_seq = l_seq al in
let r = String.make l_seq ' ' in
for i = 0 to n - 1 do
let c = int_of_char al.seq.[i] in
r.[2 * i] <- char_of_seq_code (c lsr 4) ;
if 2 * i + 1 < l_seq then r.[2 * i + 1] <- char_of_seq_code (c land 0xf) ;
done ;
Some r
let qual al =
Sam.parse_qual al.qual
let int32_is_positive =
let mask = Int32.(shift_left one 31) in
fun i -> Int32.(compare (bit_and i mask) zero) = 0
let parse_cstring buf pos =
let rec aux i =
if i < String.length buf then
if buf.[i] = '\000' then return i
else aux (i + 1)
else error_string "Unfinished NULL terminated string"
in
aux pos >>= fun pos' ->
return (String.sub buf ~pos ~len:(pos' - pos), pos' + 1)
let cCsSiIf_size = function
| 'c'
| 'C' -> return 1
| 's' | 'S' -> return 2
| 'i' | 'I'
| 'f' -> return 4
| _ -> error_string "Incorrect numeric optional field type identifier"
let parse_cCsSiIf buf pos typ =
cCsSiIf_size typ >>= fun len ->
check_buf ~buf ~pos ~len >>= fun () ->
match typ with
| 'c' ->
let i = Int32.of_int_exn (Binary_packing.unpack_signed_8 ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'C' ->
let i = Int32.of_int_exn (Binary_packing.unpack_unsigned_8 ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 's' ->
let i = Int32.of_int_exn (Binary_packing.unpack_signed_16 ~byte_order:`Little_endian ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'S' ->
let i = Int32.of_int_exn (Binary_packing.unpack_unsigned_16 ~byte_order:`Little_endian ~buf ~pos) in
return (Sam.optional_field_value_i i, len)
| 'i' ->
let i = Binary_packing.unpack_signed_32 ~byte_order:`Little_endian ~buf ~pos in
return (Sam.optional_field_value_i i, len)
| 'I' ->
let i = Binary_packing.unpack_signed_32 ~byte_order:`Little_endian ~buf ~pos in
if int32_is_positive i then return (Sam.optional_field_value_i i, len)
else error_string "Use of big unsigned ints in optional fields is not well-defined in the specification of SAM/BAM files"
| 'f' ->
let f = Binary_packing.unpack_float ~byte_order:`Little_endian ~buf ~pos in
return (Sam.optional_field_value_f f, len)
| _ -> error_string "Incorrect numeric optional field type identifier"
let parse_optional_field_value buf pos = function
| 'A' ->
check_buf ~buf ~pos ~len:1 >>= fun () ->
Sam.optional_field_value_A buf.[pos] >>= fun v ->
return (v, 1)
| 'c' | 'C' | 's' | 'S' | 'i' | 'I' | 'f' as typ ->
parse_cCsSiIf buf pos typ
| 'Z' ->
parse_cstring buf pos >>= fun (s, pos') ->
Sam.optional_field_value_Z s >>= fun value ->
return (value, pos' - pos)
| 'H' ->
parse_cstring buf pos >>= fun (s, pos') ->
Sam.optional_field_value_H s >>= fun value ->
return (value, pos' - pos)
| 'B' ->
check_buf ~buf ~pos ~len:5 >>= fun () ->
let typ = buf.[0] in
let n = Binary_packing.unpack_signed_32 ~buf ~pos:(pos + 1) ~byte_order:`Little_endian in
(
match Int32.to_int n with
| Some n ->
cCsSiIf_size typ >>= fun elt_size ->
check_buf ~buf ~pos:(pos + 5) ~len:(n * elt_size) >>= fun () ->
let elts = List.init n ~f:(fun i -> String.sub buf ~pos:(pos + 5 + i * elt_size) ~len:elt_size) in
let bytes_read = 5 + elt_size * n in
Sam.optional_field_value_B typ elts >>= fun value ->
return (value, bytes_read)
| None ->
error_string "Too many elements in B-type optional field"
)
| c -> error "Incorrect optional field type identifier" c <:sexp_of< char>>
let parse_optional_field buf pos =
check_buf ~buf ~pos ~len:3 >>= fun () ->
let tag = String.sub buf pos 2 in
let field_type = buf.[pos + 2] in
parse_optional_field_value buf (pos + 3) field_type >>= fun (field_value, shift) ->
Sam.optional_field tag field_value >>= fun field ->
return (field, shift + 3)
let parse_optional_fields buf =
let rec loop buf pos accu =
if pos = String.length buf then return (List.rev accu)
else
parse_optional_field buf pos >>= fun (field, used_chars) ->
loop buf (pos + used_chars) (field :: accu)
in
loop buf 0 []
let optional_fields al =
tag (parse_optional_fields al.optional) "Bam.Alignment0.optional_fields"
let decode al header =
flags al >>= fun flags ->
rname al header >>= fun rname ->
cigar al >>= fun cigar ->
rnext al header >>= fun rnext ->
qual al >>= fun qual ->
optional_fields al >>= fun optional_fields ->
Sam.alignment
?qname:(qname al) ~flags ?rname ?pos:(pos al) ?mapq:(mapq al)
~cigar ?rnext ?pnext:(pnext al) ?tlen:(tlen al)
?seq:(seq al) ~qual ~optional_fields
()
let find_ref_id header ref_name =
let open Or_error in
match Array.findi header.ref_seq ~f:(fun _ rs -> rs.Sam.name = ref_name) with
| Some (i, _) -> Ok i
| None -> error_string "Bam: unknown reference id"
let string_of_cigar_ops cigar_ops =
let buf = String.create (List.length cigar_ops * 4) in
let write ith i32 =
let pos = ith * 4 in
Binary_packing.pack_signed_32 ~byte_order:`Little_endian ~buf ~pos i32 in
let open Int32 in
List.iteri cigar_ops ~f:(fun idx op ->
let op_flag, i = match op with
| `Alignment_match i -> 0l, i
| `Insertion i -> 1l, i
| `Deletion i -> 2l, i
| `Skipped i -> 3l, i
| `Soft_clipping i -> 4l, i
| `Hard_clipping i -> 5l, i
| `Padding i -> 6l, i
| `Seq_match i -> 7l, i
| `Seq_mismatch i -> 8l, i
in
write idx (bit_or 0l (of_int_exn (i lsl 4)))
) ;
buf
let sizeof al =
let l_seq = l_seq al in
8 * 4
+ String.length al.read_name + 1
+ String.length al.cigar
+ (l_seq + 1) / 2
+ l_seq
+ String.length al.optional
let reg2bin st ed =
match st, ed with
| b, e when b lsr 14 = e lsr 14 ->
((1 lsl 15) - 1) / 7 + (st lsr 14)
| b, e when b lsr 17 = e lsr 17 ->
((1 lsl 12) - 1) / 7 + (st lsr 17)
| b, e when b lsr 20 = e lsr 20 ->
((1 lsl 9) - 1) / 7 + (st lsr 20)
| b, e when b lsr 23 = e lsr 23 ->
((1 lsl 6) - 1) / 7 + (st lsr 23)
| b, e when b lsr 26 = e lsr 26 ->
((1 lsl 3) - 1) / 7 + (st lsr 26)
| _ -> 0
let char_of_optional_field_value = function
| `A _ -> 'A'
| `i _ -> 'i'
| `f _ -> 'f'
| `Z _ -> 'Z'
| `H _ -> 'H'
| `B _ -> 'B'
let string_of_optional_fields opt_fields =
let rec content = function
| `B (typ, xs) ->
sprintf "%c%s" typ (String.concat ~sep:"" xs)
| `A c -> Char.to_string c
| `f f ->
let bits = Int32.bits_of_float f in
let buf = String.create 4 in
Binary_packing.pack_signed_32
~byte_order:`Little_endian bits ~buf ~pos:0;
buf
| `i i -> (
let buf = String.create 4 in
Binary_packing.pack_signed_32 i
~byte_order:`Little_endian ~buf ~pos:0;
buf
)
| `H s -> (
let r = ref [] in
String.iter s (fun c ->
r := sprintf "%02x" (Char.to_int c) :: !r
) ;
String.concat ~sep:"" (List.rev !r) ^ "\000"
)
| `Z s -> s ^ "\000"
in
List.map opt_fields ~f:(fun opt_field ->
sprintf "%s%c%s"
opt_field.Sam.tag
(char_of_optional_field_value opt_field.Sam.value)
(content opt_field.Sam.value)
)
|> String.concat ~sep:""
let int32 i ~ub var =
if i < ub then
match Int32.of_int i with
| Some i -> return i
| None -> error_string "invalid conversion to int32"
else
errorf "invalid conversion to int32 (%s than %d)" var ub
let encode_bin_mq_nl ~bin ~mapq ~l_read_name =
let open Int32 in
int32 bin ~ub:65536 "bin" >>= fun bin ->
int32 mapq ~ub:256 "mapq" >>= fun mapq ->
int32 l_read_name ~ub:256 "l_read_name" >>= fun l_read_name ->
return (
bit_or
(shift_left bin 16)
(bit_or (shift_left mapq 8) l_read_name)
)
let encode_flag_nc ~flags ~n_cigar_ops =
let open Int32 in
int32 flags ~ub:65536 "flags" >>= fun flags ->
int32 n_cigar_ops ~ub:65536 "n_cigar_ops" >>= fun n_cigar_ops ->
return (bit_or (shift_left flags 16) n_cigar_ops)
let encode al header =
begin match al.Sam.rname with
| Some rname -> find_ref_id header rname
| None -> Ok (-1)
end
>>= fun ref_id ->
let read_name = Option.value ~default:"*" al.Sam.qname in
let seq = Option.value ~default:"*" al.Sam.seq in
let pos = (Option.value ~default:0 al.Sam.pos) - 1 in
let bin = reg2bin pos (pos + String.(length seq)) in
let mapq = Option.value ~default:255 al.Sam.mapq in
let l_read_name = String.length read_name + 1 in
encode_bin_mq_nl ~bin ~mapq ~l_read_name
>>= fun bin_mq_nl ->
let flags = (al.Sam.flags :> int) in
let n_cigar_ops = List.length al.Sam.cigar in
encode_flag_nc ~flags ~n_cigar_ops
>>= fun flag_nc ->
begin match al.Sam.rnext with
| Some `Equal_to_RNAME -> Ok ref_id
| Some (`Value s) -> find_ref_id header s
| None -> Ok (-1)
end
>>= fun next_ref_id ->
let pnext = Option.value ~default:0 al.Sam.pnext - 1 in
let tlen = Option.value ~default:0 al.Sam.tlen in
let cigar = string_of_cigar_ops al.Sam.cigar in
Result.List.map al.Sam.qual ~f:(Biocaml_phred_score.to_char ~offset:`Offset33)
>>| String.of_char_list
>>= fun qual ->
let optional = string_of_optional_fields al.Sam.optional_fields in
Ok {
ref_id; pos; bin_mq_nl; flag_nc; cigar;
next_ref_id; pnext; tlen; seq; qual; optional;
read_name
}
end