datasets.py 8.93 KB
Newer Older
Yan's avatar
Yan committed
1
2
3
4
5
6
7
8
9
#!/usr/bin/env python3
from rawprasslib import load_raw
from opentimspy.opentims import OpenTIMS
try:
    from rawautoparams import load_params
    autoparams = True
except ImportError:
    autoparams = False
import numpy as np
Yan's avatar
Yan committed
10
11
12
import logging
import os.path
import pathlib
Yan's avatar
Yan committed
13
14
15
import prasopes.config as cf
import prasopes.datatools as dt

Yan's avatar
Yan committed
16

Yan's avatar
Yan committed
17
logger = logging.getLogger('dsLogger')
Yan's avatar
Yan committed
18

Yan's avatar
Yan committed
19

Yan's avatar
Yan committed
20
21
22
23
24
class Dataset():
    def __init__(self, rawfile):
        self.filename = rawfile
        self.chromatograms = []
        self.dataset = []
Yan's avatar
Yan committed
25
26
        self.headers = None
        self.params = None
Yan's avatar
Yan committed
27
28
        self.mintime = -np.inf
        self.maxtime = np.inf
Yan's avatar
Yan committed
29

Yan's avatar
Yan committed
30
    def get_chromargs(self, mint=None, maxt=None):
Yan's avatar
Yan committed
31
        logger.info("finding correct scans")
Yan's avatar
Yan committed
32
33
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
34
        times = dt.argsubselect(np.concatenate(
Yan's avatar
Yan committed
35
             [subset[0] for subset in self.chromatograms]), mint, maxt)
Yan's avatar
Yan committed
36
37
38
39
40
41
42
        args = []
        for subset in self.chromatograms:
            goodtimes = np.where((times < len(subset[0])) & ~(times < 0))[0]
            args.append(times[goodtimes])
            times -= len(subset[0])
        return args

Yan's avatar
Yan committed
43
44
45
46
47
48
49
50
    def refresh(self):
        """implement per-case"""
        return None

    def get_chromatograms(self):
        """implement per-case"""
        raise NotImplementedError

Yan's avatar
Yan committed
51
52
53
54
    def get_peakchrom(self, masstart, massend):
        """implement per-case"""
        raise NotImplementedError

Yan's avatar
Yan committed
55
56
57
    def get_spectra(self, mint=None, maxt=None):
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
58
59
60
61
62
63
64
65
66
67
68
        """implement per-case"""
        raise NotImplementedError


class ThermoRawDataset(Dataset):
    def __init__(self, rawfile):
        super().__init__(rawfile)
        self.machtype = []
        self.refresh()

    def refresh(self):
Yan's avatar
Yan committed
69
        self.dataset = load_raw(self.filename,
Yan's avatar
Yan committed
70
71
                                cf.settings().value("tmp_location"))
        self.chromatograms = self.get_chromatograms()
Yan's avatar
Yan committed
72
        self.mintime, self.maxtime = [self.chromatograms[i][0][i]
Yan's avatar
Yan committed
73
                                      for i in (0, -1)]
Yan's avatar
Yan committed
74
75
76
77
78
        if autoparams:
            try:
                self.params, rawheaders, self.machtype = load_params(
                    self.filename, cf.settings().value("tmp_location"))
                segments = [len(i[0]) for i in self.chromatograms]
Yan's avatar
Yan committed
79
80
                indicies = [sum(segments[:i+1])
                            for i, j in enumerate(segments)]
Yan's avatar
Yan committed
81
                self.headers = np.split(rawheaders, indicies)[:-1]
Yan's avatar
Yan committed
82
            except Exception:
Yan's avatar
Yan committed
83
84
85
86
87
88
                self.params, self.machtype, self.headers = None, None, None

    def get_chromatograms(self):
        if cf.settings().value("view/oddeven", type=bool):
            chroms = []
            for i in self.dataset:
Yan's avatar
Yan committed
89
90
                for j in (0, 1):
                    chroms.append([i[0][ax, :][j::2] for ax in (0, 1)])
Yan's avatar
Yan committed
91
92
93
94
        else:
            chroms = [i[0] for i in self.dataset]
        return chroms

Yan's avatar
Yan committed
95
96
97
98
    def get_spectra(self, mint=None, maxt=None):
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
        args = self.get_chromargs(mint, maxt)
Yan's avatar
Yan committed
99
        spectra = []
Yan's avatar
Yan committed
100
        for i, subset in enumerate(self.dataset):
Yan's avatar
Yan committed
101
            if cf.settings().value("view/oddeven", type=bool):
Yan's avatar
Yan committed
102
                for j in (0, 1):
Yan's avatar
Yan committed
103
104
                    if len(subset[2][args[i*2+j][j::2]]):
                        yvalz = np.mean(subset[2][args[i*2+j][j::2]], axis=0)
Yan's avatar
Yan committed
105
                        spectra.append([subset[1], yvalz])
Yan's avatar
Yan committed
106
                    else:
Yan's avatar
Yan committed
107
                        spectra.append([[], []])
Yan's avatar
Yan committed
108
109
            else:
                if len(subset[2][args[i]]):
Yan's avatar
Yan committed
110
111
                    yvalz = np.mean(subset[2][args[i]], axis=0)
                    spectra.append([subset[1], yvalz])
Yan's avatar
Yan committed
112
                else:
Yan's avatar
Yan committed
113
                    spectra.append([[], []])
Yan's avatar
Yan committed
114
115
        return spectra

Yan's avatar
Yan committed
116
    def get_peakchrom(self, startmass, endmass):
Yan's avatar
Yan committed
117
        intensity = np.concatenate([np.divide(np.sum(subset[2].T[
Yan's avatar
Yan committed
118
            dt.argsubselect(subset[1], startmass, endmass)].T, axis=1),
Yan's avatar
Yan committed
119
120
121
122
            np.clip(subset[0][1], np.finfo(np.float32).eps, None))
            for subset in self.dataset])
        return intensity

Yan's avatar
Yan committed
123
124
125
126
127
128
129

class BrukerTimsDataset(Dataset):
    def __init__(self, rawfile):
        super().__init__(rawfile)
        self.refresh()

    def refresh(self):
Yan's avatar
Yan committed
130
        logger.info("refreshing timsTOF dataset")
Yan's avatar
Yan committed
131
132
133
        if(os.path.isdir(self.filename)):
            self.dataset = OpenTIMS(pathlib.Path(self.filename))
        else:
Yan's avatar
Yan committed
134
135
            self.dataset = OpenTIMS(
                    pathlib.Path(os.path.dirname(self.filename)))
Yan's avatar
Yan committed
136
        self.chromatograms = self.get_chromatograms()
Yan's avatar
Yan committed
137
        self.mintime, self.maxtime = [self.chromatograms[i][0][i]
Yan's avatar
Yan committed
138
                                      for i in (0, -1)]
Yan's avatar
Yan committed
139

Yan's avatar
Yan committed
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
    def sampletimes(self, mint, maxt, timescap):
        frames = dt.argsubselect(self.dataset.retention_times,
                                 mint*60, maxt*60) + 1
        framessel = frames if timescap >= len(frames) else np.linspace(
                frames[0], frames[-1], timescap).astype('int')
        return framessel

    def binit(self, x, y, minstep, length):
        sortx = np.sort(x)
        stepsx = sortx[1:] - sortx[:-1]
        binspos = np.where(stepsx > minstep)[0]
        bins = sortx[:-1][binspos] + (stepsx[binspos]/2)
        binpos = np.digitize(x, bins)
        bindx = np.bincount(binpos, x) / np.bincount(binpos)
        bindy = np.bincount(binpos, y) / length
        return bindx, bindy

Yan's avatar
Yan committed
157
    def get_chromatograms(self):
Yan's avatar
Yan committed
158
        logger.info("getting timsTOF chromatogram")
Yan's avatar
Yan committed
159
        times = self.dataset.retention_times / 60
Yan's avatar
Yan committed
160
        # devNote - summing is fast, asarray is fast, iterating is slow.
Yan's avatar
Yan committed
161
162
        intensities = np.asarray([
            np.sum(i['intensity']) for i in self.dataset.query_iter(
Yan's avatar
Yan committed
163
164
                self.dataset.ms1_frames, columns=('intensity',))])
        return [[times, intensities]]
Yan's avatar
Yan committed
165

Yan's avatar
Yan committed
166
    def get_spectra(self, mint=None, maxt=None):
Yan's avatar
Yan committed
167
        logger.info("getting timsTOF spectra")
Yan's avatar
Yan committed
168
169
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
170
171
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/ms_sampling", type=int))
Yan's avatar
Yan committed
172
        massints = self.dataset.query(framessel, columns=('mz', 'intensity'))
Yan's avatar
Yan committed
173
174
175
        masses, ints = self.binit(
            massints['mz'], massints['intensity'],
            cf.settings().value("timstof/ms_bins", type=float), len(framessel))
Yan's avatar
Yan committed
176
        return [[masses, ints]]
Yan's avatar
Yan committed
177

Yan's avatar
Yan committed
178
179
180
181
182
183
184
185
186
187
188
189
    def get_mobspectra(self, mint=None, maxt=None):
        logger.info("getting timsTOF spectra")
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/ms_sampling", type=int))
        massints = self.dataset.query(framessel, columns=('inv_ion_mobility', 'intensity'))
        masses, ints = self.binit(
            massints['inv_ion_mobility'], massints['intensity'],
            cf.settings().value("timstof/mob_bins", type=float), len(framessel))
        return [[masses, ints]]

Yan's avatar
Yan committed
190
    def get_peakchrom(self, startm, endm):
Yan's avatar
Yan committed
191
        logger.info("getting peak ion chromatogram")
Yan's avatar
Yan committed
192
193
194
        intensity = np.divide([
            np.sum(i['intensity'][dt.argsubselect(i['mz'], startm, endm)])
            for i in self.dataset.query_iter(
Yan's avatar
Yan committed
195
                self.dataset.ms1_frames, columns=('intensity', 'mz'))],
Yan's avatar
Yan committed
196
197
            np.clip(self.chromatograms[0][1], np.finfo(np.float32).eps, None))
        return intensity
Yan's avatar
Yan committed
198
199
200
201
202

    def get_mobilogram(self, startm, endm, mint=None, maxt=None):
        logger.info("getting timsTOF mobilogram")
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
Yan's avatar
Yan committed
203
204
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/mob_sampling", type=int))
Yan's avatar
Yan committed
205
206
207
208
209
210
211
212
213
        massintstof = self.dataset.query(
                framessel, columns=('mz', 'intensity', 'inv_ion_mobility'))
        goodargs = dt.argsubselect(massintstof['mz'], startm, endm)
        mobs, ints = self.binit(
            massintstof['inv_ion_mobility'][goodargs],
            massintstof['intensity'][goodargs],
            cf.settings().value("timstof/mob_bins", type=float),
            len(framessel))
        return mobs, ints
Yan's avatar
Yan committed
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230

    def get_ms_onmob(self, startmob, endmob, mint=None, maxt=None):
        logger.info("getting timsTOF mobilogram")
        mint = mint or self.mintime
        maxt = maxt or self.maxtime
        framessel = self.sampletimes(mint, maxt, cf.settings().value(
                                         "timstof/mob_sampling", type=int))
        massintstof = self.dataset.query(
                framessel, columns=('mz', 'intensity', 'inv_ion_mobility'))
        goodargs = dt.argsubselect(massintstof['inv_ion_mobility'], 
                                   startmob, endmob)
        mz, ints = self.binit(
            massintstof['mz'][goodargs],
            massintstof['intensity'][goodargs],
            cf.settings().value("timstof/ms_bins", type=float),
            len(framessel))
        return mz, ints