struct
open Result.Monad_infix
let rec next p =
let open Lines.Buffer in
match (next_line p :> string option) with
| None -> `not_ready
| Some "" -> next p
| Some l when String.(is_prefix (strip l) ~prefix:"@") ->
parse_header_line (current_position p) l |> (fun x -> `output x)
| Some l ->
parse_alignment (current_position p) l |> (fun x -> `output x)
let string_to_raw ?filename () =
let name = sprintf "sam_raw_parser:%s" Option.(value ~default:"<>" filename) in
Lines.Transform.make_merge_error ~name ?filename ~next ()
let reference_sequence_to_header rs =
("SN", rs.ref_name)
:: ("LN", Int.to_string rs.ref_length)
:: (List.filter_opt [
Option.map rs.ref_assembly_identifier (fun s -> ("AS", s));
Option.map rs.ref_checksum (fun s -> ("M5", s));
Option.map rs.ref_species (fun s -> ("SP", s));
Option.map rs.ref_uri (fun s -> ("UR", s)); ]
@ rs.ref_unknown)
let reference_sequence_aggregator () =
let refs = ref [] in
let get_line line =
let sn = ref None in
let ln = ref (Error "") in
let asi = ref None in
let m5 = ref None in
let sp = ref None in
let ur = ref None in
let ios s = try Ok (Int.of_string s) with e -> Error s in
let set r v = r := Some v; None in
let ref_unknown =
List.filter_map line (function
| "SN", name -> set sn name
| "LN", l -> ln := ios l; None
| "AS", a -> set asi a
| "M5", m -> set m5 m
| "SP", s -> set sp s
| "UR", u -> set ur u
| other -> Some other) in
match !sn, !ln with
| Some sn, Ok ln ->
refs := {
ref_name = sn;
ref_length = ln;
ref_assembly_identifier = !asi;
ref_checksum = !m5;
ref_species = !sp;
ref_uri = !ur;
ref_unknown } :: !refs;
Ok ()
| None, _ -> Error (`missing_ref_sequence_name line)
| _, Error "" -> Error (`missing_ref_sequence_length line)
| _, Error s -> Error (`wrong_ref_sequence_length line)
in
let finish () =
let deduped =
List.dedup ~compare:(fun a b -> String.compare a.ref_name b.ref_name) !refs in
if List.length deduped <> List.length !refs then
Error (`duplicate_in_reference_sequence_dictionary
(Array.of_list_rev !refs))
else
Ok (Array.of_list_rev !refs)
in
(get_line, finish)
let raw_to_item () :
(raw_item, (item, [> Error.raw_to_item]) Result.t) Biocaml_transform.t =
let name = "sam_item_parser" in
let raw_queue = Dequeue.create () in
let raw_items_count = ref 0 in
let refseq_line, refseq_end = reference_sequence_aggregator () in
let header_finished = ref false in
let ref_dictionary = ref None in
let rec next stopped =
if Dequeue.is_empty raw_queue
then (if stopped then `end_of_stream else `not_ready)
else begin
incr raw_items_count;
begin match Dequeue.dequeue_exn raw_queue `front with
| `comment c when !header_finished ->
`output (Error (`comment_after_end_of_header (!raw_items_count, c)))
| `header c when !header_finished ->
`output (Error (`header_after_end_of_header (!raw_items_count, c)))
| `comment c -> `output (Ok (`comment c))
| `header ("HD", l) ->
if !raw_items_count <> 1
then `output (Error (`header_line_not_first !raw_items_count))
else `output (expand_header_line l)
| `header ("SQ", l) ->
begin match refseq_line l with
| Error e -> `output (Error e)
| Ok () -> next stopped
end
| `header _ as other -> `output (Ok other)
| `alignment a ->
if !header_finished then (
expand_alignment a !ref_dictionary
>>| (fun a -> `alignment a)
|> (fun x -> `output x)
) else begin
header_finished := true;
Dequeue.enqueue raw_queue `front (`alignment a);
begin match refseq_end () with
| Ok rd ->
ref_dictionary := Some rd;
`output (Ok (`reference_sequence_dictionary rd))
| Error e -> `output (Error e)
end
end
end
end
in
Biocaml_transform.make ~name ~feed:(Dequeue.enqueue raw_queue `back) ()
~next
let downgrade_alignment al =
let qname = al.query_template_name in
let flag = al.flags in
let rname =
match al.reference_sequence with
| `none -> "*"
| `name s -> s
| `reference_sequence rs -> rs.ref_name in
let pos = Option.value ~default:0 al.position in
let mapq = Option.value ~default:255 al.mapping_quality in
let cigar =
match al.cigar_operations with
| [| |] -> "*"
| some ->
Array.map some ~f:(function
| `M v -> sprintf "%d%c" v 'M'
| `I v -> sprintf "%d%c" v 'I'
| `D v -> sprintf "%d%c" v 'D'
| `N v -> sprintf "%d%c" v 'N'
| `S v -> sprintf "%d%c" v 'S'
| `H v -> sprintf "%d%c" v 'H'
| `P v -> sprintf "%d%c" v 'P'
| `Eq v -> sprintf "%d%c" v '='
| `X v -> sprintf "%d%c" v 'X')
|> String.concat_array ~sep:"" in
let rnext =
match al.next_reference_sequence with
| `qname -> "=" | `none -> "*" | `name s -> s
| `reference_sequence rs -> rs.ref_name in
let pnext = Option.value ~default:0 al.next_position in
let tlen = Option.value ~default:0 al.template_length in
let seq =
match al.sequence with | `string s -> s | `reference -> "=" | `none -> "*" in
begin
try
Array.map al.quality ~f:(fun q ->
Phred_score.to_char q
|> ok_exn
|> Char.to_string)
|> String.concat_array ~sep:""
|> Result.return
with
e -> Error (`wrong_phred_scores al)
end
>>= fun qual ->
let optional =
let rec optv = function
| `array (t, a) ->
sprintf "B%c%s" t String.(Array.map a ~f:optv |> concat_array ~sep:"" )
| `char c -> Char.to_string c
| `float f -> Float.to_string f
| `int i -> Int.to_string i
| `string s -> s in
let typt = function
| 'i' | 'I' | 's' | 'S' | 'c' | 'C' -> 'i'
| c -> c in
List.map al.optional_content (fun (tag, typ, v) -> (tag, typt typ, optv v))
in
Ok (`alignment {qname; flag; rname; pos;
mapq; cigar; rnext; pnext;
tlen; seq; qual; optional; })
let item_to_raw () =
let name = "sam_item_to_raw" in
let raw_queue = Dequeue.create () in
let raw_items_count = ref 0 in
let reference_sequence_dictionary = ref [| |] in
let reference_sequence_dictionary_to_output = ref 0 in
let rec next stopped =
dbg "raw_items_count: %d refdict-to-out: %d raw_queue: %d "
!raw_items_count
!reference_sequence_dictionary_to_output
(Dequeue.length raw_queue);
begin match !reference_sequence_dictionary_to_output with
| 0 ->
begin match Dequeue.is_empty raw_queue with
| true when stopped -> `end_of_stream
| true -> `not_ready
| false ->
incr raw_items_count;
begin match Dequeue.dequeue_exn raw_queue `front with
| `comment c ->
dbg "comment"; `output (Ok (`comment c))
| `header_line (version, sorting, rest) ->
dbg "header";
`output (Ok (`header ("HD",
("VN", version)
:: ("SO",
match sorting with
| `unknown -> "unknown"
| `unsorted -> "unsorted"
| `queryname -> "queryname"
| `coordinate -> "coordinate")
:: rest)))
| `reference_sequence_dictionary rsd ->
dbg "reference_sequence_dictionary %d" (Array.length rsd);
reference_sequence_dictionary := rsd;
reference_sequence_dictionary_to_output := Array.length rsd;
next stopped
| `header ("SQ", _) ->
dbg "skipping SQ line";
next stopped
| `header h ->
dbg "header %s" (fst h);
`output (Ok (`header h))
| `alignment al ->
dbg "alignment";
downgrade_alignment al |> (fun x -> `output x)
end
end
| n ->
let o =
!reference_sequence_dictionary.(
Array.length !reference_sequence_dictionary - n) in
reference_sequence_dictionary_to_output := n - 1;
`output (Ok (`header ("SQ", reference_sequence_to_header o)))
end
in
Biocaml_transform.make ~name ~feed:(Dequeue.enqueue raw_queue `back) ()
~next
let alignment_to_string x =
sprintf "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n"
x.qname x.flag x.rname x.pos x.mapq x.cigar x.rnext x.pnext x.tlen x.seq x.qual
(List.map x.optional (fun (a,b,c) -> sprintf "%s:%c:%s" a b c) |>
String.concat ~sep:"\t")
let raw_to_string () =
let to_string = function
| `comment c -> sprintf "@CO\t%s\n" c
| `header (t, l) ->
sprintf "@%s\t%s\n" t
(List.map l (fun (a,b) -> sprintf "%s:%s" a b)
|> String.concat ~sep:"\t")
| `alignment a -> alignment_to_string a
in
Biocaml_transform.of_function ~name:"sam_to_string" to_string
end