datasets.py 10 KB
Newer Older
Yan's avatar
Yan committed
1
2
3
4
5
6
7
8
9
#!/usr/bin/env python3
from rawprasslib import load_raw
from opentimspy.opentims import OpenTIMS
try:
    from rawautoparams import load_params
    autoparams = True
except ImportError:
    autoparams = False
import numpy as np
Yan's avatar
Yan committed
10
11
12
import logging
import os.path
import pathlib
Yan's avatar
Yan committed
13
14
15
import prasopes.config as cf
import prasopes.datatools as dt

Yan's avatar
Yan committed
16

Yan's avatar
Yan committed
17
logger = logging.getLogger('dsLogger')
Yan's avatar
Yan committed
18

Yan's avatar
Yan committed
19

Yan's avatar
Yan committed
20
21
22
23
24
25
def logdecor(fnc):
    logger.info(fnc.__doc__ + "...")
    fnc()
    logger.info("DONE: " + fnc.__doc__)


Yan's avatar
Yan committed
26
27
28
29
30
class Dataset():
    def __init__(self, rawfile):
        self.filename = rawfile
        self.chromatograms = []
        self.dataset = []
Yan's avatar
Yan committed
31
32
        self.headers = None
        self.params = None
Yan's avatar
Yan committed
33
34
        self.mintime = -np.inf
        self.maxtime = np.inf
Yan's avatar
Yan committed
35

Yan's avatar
Yan committed
36
    def get_chromargs(self, mint=None, maxt=None):
Yan's avatar
Yan committed
37
        logger.info("finding correct scans")
Yan's avatar
Yan committed
38
39
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
40
        times = dt.argsubselect(np.concatenate(
Yan's avatar
Yan committed
41
             [subset[0] for subset in self.chromatograms]), mint, maxt)
Yan's avatar
Yan committed
42
43
44
45
46
47
48
        args = []
        for subset in self.chromatograms:
            goodtimes = np.where((times < len(subset[0])) & ~(times < 0))[0]
            args.append(times[goodtimes])
            times -= len(subset[0])
        return args

Yan's avatar
Yan committed
49
50
51
52
53
54
55
56
    def refresh(self):
        """implement per-case"""
        return None

    def get_chromatograms(self):
        """implement per-case"""
        raise NotImplementedError

Yan's avatar
Yan committed
57
58
59
60
    def get_peakchrom(self, masstart, massend):
        """implement per-case"""
        raise NotImplementedError

Yan's avatar
Yan committed
61
62
63
    def get_spectra(self, mint=None, maxt=None):
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
64
65
66
67
68
69
70
71
72
73
74
        """implement per-case"""
        raise NotImplementedError


class ThermoRawDataset(Dataset):
    def __init__(self, rawfile):
        super().__init__(rawfile)
        self.machtype = []
        self.refresh()

    def refresh(self):
Yan's avatar
Yan committed
75
        self.dataset = load_raw(self.filename,
Yan's avatar
Yan committed
76
77
                                cf.settings().value("tmp_location"))
        self.chromatograms = self.get_chromatograms()
Yan's avatar
Yan committed
78
        self.fullspectra = self.get_spectra()
Yan's avatar
Yan committed
79
        self.mintime, self.maxtime = [self.chromatograms[i][0][i]
Yan's avatar
Yan committed
80
                                      for i in (0, -1)]
Yan's avatar
Yan committed
81
82
83
84
85
        if autoparams:
            try:
                self.params, rawheaders, self.machtype = load_params(
                    self.filename, cf.settings().value("tmp_location"))
                segments = [len(i[0]) for i in self.chromatograms]
Yan's avatar
Yan committed
86
87
                indicies = [sum(segments[:i+1])
                            for i, j in enumerate(segments)]
Yan's avatar
Yan committed
88
                self.headers = np.split(rawheaders, indicies)[:-1]
Yan's avatar
Yan committed
89
            except Exception:
Yan's avatar
Yan committed
90
91
92
93
94
95
                self.params, self.machtype, self.headers = None, None, None

    def get_chromatograms(self):
        if cf.settings().value("view/oddeven", type=bool):
            chroms = []
            for i in self.dataset:
Yan's avatar
Yan committed
96
97
                for j in (0, 1):
                    chroms.append([i[0][ax, :][j::2] for ax in (0, 1)])
Yan's avatar
Yan committed
98
99
100
101
        else:
            chroms = [i[0] for i in self.dataset]
        return chroms

Yan's avatar
Yan committed
102
103
104
105
    def get_spectra(self, mint=None, maxt=None):
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
        args = self.get_chromargs(mint, maxt)
Yan's avatar
Yan committed
106
        spectra = []
Yan's avatar
Yan committed
107
        for i, subset in enumerate(self.dataset):
Yan's avatar
Yan committed
108
            if cf.settings().value("view/oddeven", type=bool):
Yan's avatar
Yan committed
109
                for j in (0, 1):
Yan's avatar
Yan committed
110
111
                    if len(subset[2][args[i*2+j][j::2]]):
                        yvalz = np.mean(subset[2][args[i*2+j][j::2]], axis=0)
Yan's avatar
Yan committed
112
                        spectra.append([subset[1], yvalz])
Yan's avatar
Yan committed
113
                    else:
Yan's avatar
Yan committed
114
                        spectra.append([[], []])
Yan's avatar
Yan committed
115
116
            else:
                if len(subset[2][args[i]]):
Yan's avatar
Yan committed
117
118
                    yvalz = np.mean(subset[2][args[i]], axis=0)
                    spectra.append([subset[1], yvalz])
Yan's avatar
Yan committed
119
                else:
Yan's avatar
Yan committed
120
                    spectra.append([[], []])
Yan's avatar
Yan committed
121
122
        return spectra

Yan's avatar
Yan committed
123
    def get_peakchrom(self, startmass, endmass):
Yan's avatar
Yan committed
124
        intensity = np.concatenate([np.divide(np.sum(subset[2].T[
Yan's avatar
Yan committed
125
            dt.argsubselect(subset[1], startmass, endmass)].T, axis=1),
Yan's avatar
Yan committed
126
127
128
129
            np.clip(subset[0][1], np.finfo(np.float32).eps, None))
            for subset in self.dataset])
        return intensity

Yan's avatar
Yan committed
130
131
132
133
134
135

class BrukerTimsDataset(Dataset):
    def __init__(self, rawfile):
        super().__init__(rawfile)
        self.refresh()

Yan's avatar
Yan committed
136
    @logdecor
Yan's avatar
Yan committed
137
    def refresh(self):
Yan's avatar
Yan committed
138
        """refreshing timsTOF dataset"""
Yan's avatar
Yan committed
139
140
141
        if(os.path.isdir(self.filename)):
            self.dataset = OpenTIMS(pathlib.Path(self.filename))
        else:
Yan's avatar
Yan committed
142
143
            self.dataset = OpenTIMS(
                    pathlib.Path(os.path.dirname(self.filename)))
Yan's avatar
Yan committed
144
        self.chromatograms = self.get_chromatograms()
Yan's avatar
Yan committed
145
        self.fullspectra = self.get_spectra()
Yan's avatar
Yan committed
146
        self.mintime, self.maxtime = [self.chromatograms[i][0][i]
Yan's avatar
Yan committed
147
                                      for i in (0, -1)]
Yan's avatar
Yan committed
148

Yan's avatar
Yan committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
    def sampletimes(self, mint, maxt, timescap):
        frames = dt.argsubselect(self.dataset.retention_times,
                                 mint*60, maxt*60) + 1
        framessel = frames if timescap >= len(frames) else np.linspace(
                frames[0], frames[-1], timescap).astype('int')
        return framessel

    def binit(self, x, y, minstep, length):
        sortx = np.sort(x)
        stepsx = sortx[1:] - sortx[:-1]
        binspos = np.where(stepsx > minstep)[0]
        bins = sortx[:-1][binspos] + (stepsx[binspos]/2)
        binpos = np.digitize(x, bins)
        bindx = np.bincount(binpos, x) / np.bincount(binpos)
        bindy = np.bincount(binpos, y) / length
        return bindx, bindy

Yan's avatar
Yan committed
166
    @logdecor
Yan's avatar
Yan committed
167
    def get_chromatograms(self):
Yan's avatar
Yan committed
168
        """getting timsTOF chromatogram"""
Yan's avatar
Yan committed
169
        times = self.dataset.retention_times / 60
Yan's avatar
Yan committed
170
171
172
173
174
175
176
177
178
179
        if cf.settings().value("timstof/fastchrom", type=bool):
            # Thx to Michał Startek, slight deviation, but much faster
            intensities = (self.dataset.frames['SummedIntensities'] *
                           self.dataset.frames['AccumulationTime']) / 100.0
        else:
            # devNote - summing is fast, asarray is fast, iterating is slow.
            intensities = np.asarray([
                np.sum(i['intensity']) for i in self.dataset.query_iter(
                    self.dataset.ms1_frames, columns=('intensity',))])
        logger.info("DONE: getting timsTOF chromatogram")
Yan's avatar
Yan committed
180
        return [[times, intensities]]
Yan's avatar
Yan committed
181

Yan's avatar
Yan committed
182
    @logdecor
Yan's avatar
Yan committed
183
    def get_spectra(self, mint=None, maxt=None):
Yan's avatar
Yan committed
184
        """getting timsTOF spectra"""
Yan's avatar
Yan committed
185
186
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
187
188
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/ms_sampling", type=int))
Yan's avatar
Yan committed
189
        massints = self.dataset.query(framessel, columns=('mz', 'intensity'))
Yan's avatar
Yan committed
190
191
192
        masses, ints = self.binit(
            massints['mz'], massints['intensity'],
            cf.settings().value("timstof/ms_bins", type=float), len(framessel))
Yan's avatar
Yan committed
193
        return [[masses, ints]]
Yan's avatar
Yan committed
194

Yan's avatar
Yan committed
195
    @logdecor
Yan's avatar
Yan committed
196
    def get_mobspectra(self, mint=None, maxt=None):
Yan's avatar
Yan committed
197
        """getting timsTOF spectra"""
Yan's avatar
Yan committed
198
199
200
201
202
203
204
205
206
207
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/ms_sampling", type=int))
        massints = self.dataset.query(framessel, columns=('inv_ion_mobility', 'intensity'))
        masses, ints = self.binit(
            massints['inv_ion_mobility'], massints['intensity'],
            cf.settings().value("timstof/mob_bins", type=float), len(framessel))
        return [[masses, ints]]

Yan's avatar
Yan committed
208
    @logdecor
Yan's avatar
Yan committed
209
    def get_peakchrom(self, startm, endm):
Yan's avatar
Yan committed
210
        """getting peak ion chromatogram"""
Yan's avatar
Yan committed
211
212
213
        intensity = np.divide([
            np.sum(i['intensity'][dt.argsubselect(i['mz'], startm, endm)])
            for i in self.dataset.query_iter(
Yan's avatar
Yan committed
214
                np.arange(self.dataset.max_frame)+1, columns=('intensity', 'mz'))],
Yan's avatar
Yan committed
215
216
            np.clip(self.chromatograms[0][1], np.finfo(np.float32).eps, None))
        return intensity
Yan's avatar
Yan committed
217

Yan's avatar
Yan committed
218
    @logdecor
Yan's avatar
Yan committed
219
    def get_mzmobpeakchrom(self, startmz, endmz, startmob, endmob):
Yan's avatar
Yan committed
220
        """getting mz/mob peak ion chromatogram..."""
Yan's avatar
Yan committed
221
222
223
224
225
226
227
228
229
230
231
232
        boundsmz = sorted([startmz, endmz])
        boundsmob = sorted([startmob, endmob])
        intensity = np.divide([
            np.sum(i['intensity'][dt.argsubselect_2d(
                i['mz'], startmz, endmz,
                i['inv_ion_mobility'], startmob, endmob)])
            for i in self.dataset.query_iter(
                np.arange(self.dataset.max_frame)+1, columns=(
                    'intensity', 'mz', 'inv_ion_mobility'))],
            np.clip(self.chromatograms[0][1], np.finfo(np.float32).eps, None))
        return intensity

233
234
    def get_subspace(self, xax, subselect, start, end, bins,
                     mint=None, maxt=None):
Yan's avatar
Yan committed
235
236
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
237
238
        msg = ("getting timsTOF for {:.3f}-{:.3f} minutes, subselecting"
                     " {}: {}-{}").format(mint, maxt, subselect, start, end)
Yan's avatar
Yan committed
239
        logger.info(msg+"...")
Yan's avatar
Yan committed
240
241
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/mob_sampling", type=int))
Yan's avatar
Yan committed
242
        massintstof = self.dataset.query(
243
244
                framessel, columns=(subselect, 'intensity', xax))
        goodargs = dt.argsubselect(massintstof[subselect], start, end)
Yan's avatar
Yan committed
245
        mobs, ints = self.binit(
246
247
            massintstof[xax][goodargs], massintstof['intensity'][goodargs],
            cf.settings().value("timstof/{}".format(bins), type=float),
Yan's avatar
Yan committed
248
            len(framessel))
Yan's avatar
Yan committed
249
        logger.info("DONE: "+msg)
Yan's avatar
Yan committed
250
        return mobs, ints
251
252
253
254
255
256
257
258
259
260

    def get_mobilogram(self, startm, endm, mint=None, maxt=None):
        x, y = self. get_subspace('inv_ion_mobility', 'mz',
                                  startm, endm, 'mob_bins', mint, maxt)
        return x, y

    def get_ms_onmob(self, startm, endm, mint=None, maxt=None):
        x, y = self.get_subspace('mz', 'inv_ion_mobility',
                                 startm, endm, 'ms_bins', mint, maxt)
        return x, y