struct
let string_to_raw_item ~string_to_partial_sequence
?filename ?(tags=Tags.char_sequence_default) () =
let name = sprintf "fasta_parser:%s" Option.(value ~default:"<>" filename) in
let chopl s = String.(sub s ~pos:1 ~len:(length s - 1)) in
let open Lines.Buffer in
let buffer = make ?filename () in
let feed = feed_string buffer in
let rec next stopped = match (next_line buffer :> string option) with
| None -> if stopped then `end_of_stream else `not_ready
| Some line ->
let open Tags in
if String.for_all line ~f:Char.is_whitespace then
if tags.forbid_empty_lines then
output_error (`empty_line (current_position buffer))
else
next stopped
else if tags.sharp_comments && line.[0] = '#' then
output_ok (`comment (chopl line))
else if tags.semicolon_comments && line.[0] = ';' then
output_ok (`comment (chopl line))
else if line.[0] = '>' then
output_ok (`header (chopl line))
else
string_to_partial_sequence ~pos:(current_position buffer) line
in
Biocaml_transform.make ~name ~feed ~next ()
let string_to_char_seq_raw_item
?filename ?(tags=Tags.char_sequence_default) () =
let check_alphabet ~pos s = match tags.Tags.sequence with
| `int_sequence
| `char_sequence None ->
output_ok (`partial_sequence s)
| `char_sequence (Some alphabet) ->
if String.for_all s ~f:(List.mem alphabet) then
output_ok (`partial_sequence s)
else
output_error (`malformed_partial_sequence (pos,s))
in
let string_to_partial_sequence ~pos s =
match tags.Tags.max_items_per_line with
| Some n ->
if String.length s > n then
output_error (`sequence_is_too_long (pos,s))
else
check_alphabet ~pos s
| None -> check_alphabet ~pos s
in
string_to_raw_item
~string_to_partial_sequence
?filename
~tags
()
let string_to_int_seq_raw_item
?filename ?(tags=Tags.char_sequence_default) () =
let string_to_partial_sequence ~pos s =
try
let il = List.filter_map (String.split ~on:' ' s) ~f:(function
| "" -> None
| s -> Some (Int.of_string s))
in
match tags.Tags.max_items_per_line with
| Some n ->
if List.length il > n then
output_error (`sequence_is_too_long (pos,s))
else
output_ok (`partial_sequence il)
| None -> output_ok (`partial_sequence il)
with _ -> output_error (`malformed_partial_sequence (pos,s))
in
string_to_raw_item
~string_to_partial_sequence
?filename
~tags
()
let raw_item_to_string_pure ?comment_char alpha_to_string =
function
| `comment c ->
Option.value_map comment_char
~default:"" ~f:(fun o -> sprintf "%c%s\n" o c)
| `header n -> ">" ^ n ^ "\n"
| `partial_sequence s -> (alpha_to_string s) ^ "\n"
let generic_printer ~to_string ~tags () =
let comment_char = Tags.comment_char tags in
Biocaml_transform.of_function
(raw_item_to_string_pure ?comment_char to_string)
let char_seq_raw_item_to_string ?(tags=Tags.char_sequence_default) =
generic_printer ~to_string:ident ~tags
let int_seq_to_string_pure = fun l ->
String.concat ~sep:" " (List.map l Int.to_string)
let int_seq_raw_item_to_string ?(tags=Tags.int_sequence_default) =
generic_printer ~to_string:int_seq_to_string_pure ~tags
let generic_aggregator ~flush ~add ~is_empty ~unnamed_sequence () =
let current_name = ref None in
let result = Queue.create () in
Biocaml_transform.make ~name:"fasta_aggregator" ()
~feed:(function
| `header n ->
Queue.enqueue result (!current_name, flush ());
current_name := Some n;
| `partial_sequence s -> add s
| `comment c -> ())
~next:(fun stopped ->
match Queue.dequeue result with
| None ->
if stopped
then
begin match !current_name with
| None -> `end_of_stream
| Some name ->
current_name := None;
output_ok {header=name; sequence=flush ()}
end
else `not_ready
| Some (None, stuff) when is_empty stuff -> `not_ready
| Some (None, non_empty) ->
output_error (unnamed_sequence non_empty)
| Some (Some name, seq) ->
output_ok {header=name; sequence=seq})
let char_seq_raw_item_to_item () =
let current_sequence = Buffer.create 42 in
generic_aggregator
~flush:(fun () ->
let s = Buffer.contents current_sequence in
Buffer.clear current_sequence;
s)
~add:(fun s -> Buffer.add_string current_sequence s)
~is_empty:(fun s -> s = "")
~unnamed_sequence:(fun x -> `unnamed_char_seq x)
()
let int_seq_raw_item_to_item () =
let scores = Queue.create () in
generic_aggregator
~flush:(fun () ->
let l = Queue.to_list scores in
Queue.clear scores;
List.concat l)
~add:(fun l -> Queue.enqueue scores l)
~is_empty:((=) [])
~unnamed_sequence:(fun x -> `unnamed_int_seq x)
()
let char_seq_item_to_raw_item ?(tags=Tags.char_sequence_default) () =
let items_per_line = Tags.max_items_per_line tags in
let queue = Queue.create () in
Biocaml_transform.make ~name:"fasta_slicer" ()
~feed:(fun {header=hdr; sequence=seq} ->
Queue.enqueue queue (`header hdr);
let rec loop idx =
if idx + items_per_line >= String.length seq then (
Queue.enqueue queue
(`partial_sequence String.(sub seq idx (length seq - idx)));
) else (
Queue.enqueue queue
(`partial_sequence String.(sub seq idx items_per_line));
loop (idx + items_per_line);
) in
loop 0)
~next:(fun stopped ->
match Queue.dequeue queue with
| Some s -> `output s
| None -> if stopped then `end_of_stream else `not_ready)
let int_seq_item_to_raw_item ?(tags=Tags.int_sequence_default) () =
let items_per_line = Tags.max_items_per_line tags in
let queue = Queue.create () in
Biocaml_transform.make ~name:"fasta_slicer" ()
~feed:(fun {header=hdr; sequence=seq} ->
Queue.enqueue queue (`header hdr);
let rec loop l =
match List.split_n l items_per_line with
| finish, [] ->
Queue.enqueue queue (`partial_sequence finish);
| some, rest ->
Queue.enqueue queue (`partial_sequence some);
loop rest
in
loop seq)
~next:(fun stopped ->
match Queue.dequeue queue with
| Some s -> `output s
| None -> if stopped then `end_of_stream else `not_ready)
end