open Core.Std
open Bigarray
type vec = (float, float64_elt, fortran_layout) Array1.t
type int_vec = (int, int_elt, fortran_layout) Array1.t
module Base64 = struct
external init : unit -> unit = "biocaml_base64_init"
let () = init ()
external little32 :
string -> npeaks:int -> (float, float64_elt, _) Array1.t -> unit
= "biocaml_base64_little32"
external big32 :
string -> npeaks:int -> (float, float64_elt, _) Array1.t -> unit
= "biocaml_base64_big32"
external little64 :
string -> npeaks:int -> (float, float64_elt, _) Array1.t -> unit
= "biocaml_base64_little64"
external big64 :
string -> npeaks:int -> (float, float64_elt, _) Array1.t -> unit
= "biocaml_base64_big64"
let decode ~precision ~little_endian s =
if precision = 32 then
let npeaks = (String.length s / 4) * 3 / 4 in
let v = Array1.create float64 fortran_layout npeaks in
if little_endian then little32 s ~npeaks v else big32 s ~npeaks v;
v
else if precision = 64 then
let npeaks = (String.length s / 4) * 3 / 8 in
let v = Array1.create float64 fortran_layout npeaks in
if little_endian then little64 s ~npeaks v else big64 s ~npeaks v;
v
else invalid_arg "MzData: <peak> precision must be 32 or 64"
end
let rec skip_tag_loop xml depth =
match Xmlm.input xml with
| `El_start _ -> skip_tag_loop xml (depth + 1)
| `El_end -> if depth > 0 then skip_tag_loop xml (depth - 1)
| `Data _ | `Dtd _ -> skip_tag_loop xml depth
let skip_tag xml = skip_tag_loop xml 0
let rec get_next_data xml =
match Xmlm.input xml with
| `Data s ->
s
| `El_start _ ->
skip_tag xml;
get_next_data xml
| `El_end ->
failwith "MzData.spectrums: got tag while looking for XML data"
| _ -> get_next_data xml
let rec return_on_end_tag xml v =
match Xmlm.input xml with
| `El_end -> v
| `El_start _ -> skip_tag xml; return_on_end_tag xml v
| _ -> return_on_end_tag xml v
let rec attribute_exn name = function
| [] -> failwith "MzData.spectrums: attribute not found"
| ((_, n), v) :: tl -> if n = name then v else attribute_exn name tl
let rec attribute name = function
| [] -> None
| ((_, n), v) :: tl -> if n = name then Some v else attribute name tl
module Precursor = struct
type t = {
mslevel: int;
mz: float;
z: float;
int: float;
}
let mass_proton = 1.00727646677
let mass p = (p.mz -. mass_proton) *. p.z
let rec get_ionSelection xml p depth =
match Xmlm.input xml with
| `El_start((_, "cvParam"), attr) ->
let depth = depth + 1 in
let name = attribute_exn "name" attr in
let value = attribute_exn "value" attr in
if name = "MassToChargeRatio" then
get_ionSelection xml { p with mz = Float.of_string value } depth
else if name = "ChargeState" then
get_ionSelection xml { p with z = Float.of_string value } depth
else if name = "Intensity" then
get_ionSelection xml { p with int = Float.of_string value } depth
else
get_ionSelection xml p depth
| `El_start _ -> get_ionSelection xml p (depth + 1)
| `El_end -> if depth = 0 then p
else get_ionSelection xml p (depth - 1)
| `Data _ | `Dtd _ -> get_ionSelection xml p depth
let rec get_precursor xml p =
match Xmlm.input xml with
| `El_start((_, "ionSelection"), _) -> get_ionSelection xml p 0
| `El_start _ -> skip_tag xml; get_precursor xml p
| `El_end -> p
| `Data _ | `Dtd _ -> get_precursor xml p
let rec add_list xml pl =
match Xmlm.input xml with
| `El_start((_, "precursor"), attr) ->
let mslevel = int_of_string(attribute_exn "msLevel" attr) in
let p = get_precursor xml { mslevel; mz = Float.nan; z = Float.nan; int = Float.nan } in
add_list xml (p :: pl)
| `El_start _ -> skip_tag xml; add_list xml pl
| `El_end -> pl
| `Data _ | `Dtd _ -> add_list xml pl
let list xml = add_list xml []
end
type spectrum = {
id: int;
mslevel: int;
precursor: Precursor.t list;
mz: vec;
int: vec;
sup: (string * vec) list;
}
let rec vec_of_binary_data xml =
match Xmlm.input xml with
| `El_start((_, "data"), atts) ->
let precision = int_of_string(attribute_exn "precision" atts) in
let length = int_of_string(attribute_exn "length" atts) in
let little_endian = attribute_exn "endian" atts = "little" in
let data = get_next_data xml in
let v = Base64.decode ~precision ~little_endian data in
if Array1.dim v <> length then
failwith(sprintf "MzData: Invalid XML: <data> expected length: %i, got: %i" length (Array1.dim v));
return_on_end_tag xml v
| _ -> vec_of_binary_data xml
let rec get_spectrum xml spec depth =
match Xmlm.input xml with
| `El_start((_, "spectrumInstrument"), atts) ->
let mslevel = int_of_string(attribute_exn "msLevel" atts) in
let spec = { spec with mslevel } in
get_spectrum xml spec (depth + 1)
| `El_start((_, "precursorList"), _) ->
let spec = { spec with precursor = Precursor.list xml } in
get_spectrum xml spec (depth + 1)
| `El_start((_, "mzArrayBinary"), _) ->
let spec = { spec with mz = vec_of_binary_data xml } in
get_spectrum xml spec (depth + 1)
| `El_start((_, "intenArrayBinary"), _) ->
let spec = { spec with int = vec_of_binary_data xml } in
get_spectrum xml spec (depth + 1)
| `El_start _ -> get_spectrum xml spec (depth + 1)
| `El_end ->
if depth = 0 then spec else get_spectrum xml spec (depth - 1)
| _ -> get_spectrum xml spec depth
let empty_vec = Array1.create float64 fortran_layout 0
let of_file fname =
let fh = open_in fname in
let xml = Xmlm.make_input ~enc:(Some `UTF_8) (`Channel fh) in
let scans = ref [] in
while not(Xmlm.eoi xml) do
match Xmlm.input xml with
| `El_start((_, "spectrum"), atts) ->
let id = int_of_string(attribute_exn "id" atts) in
let scan = { id; mslevel = 0; precursor = [];
mz = empty_vec; int = empty_vec; sup = [] } in
let scan = get_spectrum xml scan 0 in
scans := scan :: !scans
| _ -> ()
done;
!scans