Commit 6c5f1fd9 authored by Nik Vaessen

initial commit
.idea
.vscode
venv/
.venv/
*.pyc
dist/
build/
*.egg-info/
.tox/
.coverage
/lightning_logs/
/logs/
/data/
notebooks/.ipynb_checkpoints
notebooks/playground.ipynb
#! /usr/bin/env python3
################################################################################
#
# Implement the command-line interface for prediction/inference.
#
# Author(s): Nik Vaessen, David van Leeuwen
################################################################################
import argparse
from pathlib import Path
import numpy as np
from tqdm import tqdm
from pyllr.pav_rocch import PAV, ROCCH
from skeleton.data.preprocess import Preprocessor
from skeleton.models.prototype import PrototypeSpeakerRecognitionModule
from skeleton.data.tiny_voxceleb import (
    load_evaluation_pairs,
    _collate_samples,
    init_dataset,
)
from skeleton.evaluation.evaluator import SpeakerRecognitionEvaluator, EmbeddingSample
########################################################################################
# function to compute EER based on list of scores and list of ground truths
def compute_eer(scores: list, labels: list):
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels, dtype=float)

    pav = PAV(scores, labels)
    rocch = ROCCH(pav)

    return rocch.EER()
########################################################################################
# CLI input arguments
parser = argparse.ArgumentParser()
# required positional paths to data
parser.add_argument(
    "checkpoint",
    type=Path,
    help="path to checkpoint file",
)
parser.add_argument(
    "shards_dirs_to_evaluate",
    type=str,
    help="paths to shard file containing all audio files in the given trial list (',' separated)",
)
parser.add_argument(
    "trial_lists", type=str, help="file with list of trials (',' separated)"
)
parser.add_argument("score_file", type=Path, help="output file to store scores")

# optional arguments based on pre-processing variables
parser.add_argument(
    "--normalize_channel_wise",
    type=bool,
    default=True,
    help="whether to normalize each input mel band separately, should be the same as in train",
)
parser.add_argument(
    "--n_mels",
    type=int,
    default=40,
    help="number of mel bands to generate, should be the same as in train",
)
################################################################################
# entrypoint of script
def main(
    checkpoint_path: Path,
    shards_dirs_to_evaluate: str,
    trial_lists: str,
    score_file: Path,
    normalize_channel_wise: bool,
    n_mels: int,
):
    # load module from checkpoint
    model = PrototypeSpeakerRecognitionModule.load_from_checkpoint(str(checkpoint_path))

    preprocessor = Preprocessor(
        audio_length_seconds=30,
        n_mels=n_mels,
        normalize_channel_wise=normalize_channel_wise,
        normalize=True,
    )

    print(model)

    shards_dirs = shards_dirs_to_evaluate.split(",")
    trial_lists = trial_lists.split(",")

    pairs = list()
    embeddings = list()

    # process dev and test simultaneously
    for shards_dir, trial_list in zip(shards_dirs, trial_lists):
        print("Processing shards dir", shards_dir)

        dataset = init_dataset(Path(shards_dir))
        dataset = dataset.map(preprocessor.test_data_pipeline).batched(
            1, collation_fn=_collate_samples
        )

        pairs.extend(load_evaluation_pairs(Path(trial_list)))

        for x in tqdm(dataset):
            assert x.batch_size == 1

            embedding, _ = model(x.network_input)

            key = x.keys[0]
            if key.startswith("unknown"):
                # for eval, path in shard is unknown/unknown/$key
                key = key.split("/")[-1]

            embeddings.append(EmbeddingSample(key, embedding.squeeze()))

    scores = SpeakerRecognitionEvaluator.evaluate(
        pairs, embeddings, model.mean_embedding, model.std_embedding, skip_eer=True
    )

    eer_scores = list()
    eer_labels = list()

    with open(score_file, "w") as out:
        for score, pair in zip(scores, pairs):
            print(score, f"{pair.sample1_id}.wav", f"{pair.sample2_id}.wav", file=out)

            if pair.same_speaker is not None:
                eer_scores.append(score)
                eer_labels.append(pair.same_speaker)

    if len(eer_scores) > 0:
        eer = compute_eer(eer_scores, eer_labels)
        print(
            f"EER computed over {len(eer_scores)} trials for which truth is known: {eer*100:4.2f}%"
        )


if __name__ == "__main__":
    args = parser.parse_args()

    main(
        checkpoint_path=args.checkpoint,
        shards_dirs_to_evaluate=args.shards_dirs_to_evaluate,
        trial_lists=args.trial_lists,
        score_file=args.score_file,
        normalize_channel_wise=args.normalize_channel_wise,
        n_mels=args.n_mels,
    )
#!/usr/bin/env python3
################################################################################
#
# Implement the command-line interface for training a network.
#
# Author(s): Nik Vaessen
################################################################################
import pathlib
import click
import pytorch_lightning
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from skeleton.data.preprocess import Preprocessor
from skeleton.data.tiny_voxceleb import TinyVoxcelebDataModule
from skeleton.models.prototype import PrototypeSpeakerRecognitionModule
################################################################################
# entrypoint of script
@click.command()
@click.option(
    "--shard_folder",
    type=pathlib.Path,
    required=True,
    help="path to root folder containing train, val and dev shard subfolders",
)
@click.option(
    "--val_trials_path",
    type=pathlib.Path,
    required=True,
    help="path to .txt file containing val trials",
)
@click.option(
    "--dev_trials_path",
    type=pathlib.Path,
    required=True,
    help="path to .txt file containing dev trials",
)
@click.option(
    "--batch_size",
    type=int,
    default=128,
    help="batch size to use for train and val split",
)
@click.option(
    "--audio_length_seconds",
    type=float,
    default=3,
    help="The length of each audio file during training in seconds",
)
@click.option(
    "--normalize_channel_wise",
    type=bool,
    default=True,
    help="whether to normalize each input mel band separately",
)
@click.option("--n_mels", type=int, default=40, help="number of mel bands to generate")
@click.option(
    "--embedding_size",
    type=int,
    default=128,
    help="dimensionality of learned speaker embeddings",
)
@click.option(
    "--learning_rate",
    type=float,
    default=3e-3,
    help="constant learning rate used during training",
)
@click.option("--epochs", type=int, default=30, help="number of epochs to train for")
@click.option("--gpus", type=int, default=1, help="number of gpus to use")
@click.option("--random_seed", type=int, default=1337, help="the random seed")
def main(
    shard_folder: pathlib.Path,
    val_trials_path: pathlib.Path,
    dev_trials_path: pathlib.Path,
    batch_size: int,
    audio_length_seconds: float,
    normalize_channel_wise: bool,
    n_mels: int,
    embedding_size: int,
    learning_rate: float,
    epochs: int,
    gpus: int,
    random_seed: int,
):
    # log input
    print("### input arguments ###")
    print(f"shard_folder={shard_folder}")
    print(f"val_trials_path={val_trials_path}")
    print(f"dev_trials_path={dev_trials_path}")
    print(f"batch_size={batch_size}")
    print(f"audio_length_seconds={audio_length_seconds}")
    print(f"normalize_channel_wise={normalize_channel_wise}")
    print(f"n_mels={n_mels}")
    print(f"embedding_size={embedding_size}")
    print(f"learning_rate={learning_rate}")
    print(f"epochs={epochs}")
    print(f"gpus={gpus}")
    print(f"random_seed={random_seed}")
    print()

    # set random seed
    pytorch_lightning.seed_everything(random_seed)

    # build data loader
    dm = TinyVoxcelebDataModule(
        shard_folder=shard_folder,
        val_trials_path=val_trials_path,
        dev_trials_path=dev_trials_path,
        batch_size=batch_size,
        preprocessor=Preprocessor(
            audio_length_seconds=audio_length_seconds,
            n_mels=n_mels,
            normalize=True,
            normalize_channel_wise=normalize_channel_wise,
        ),
    )

    # build model
    model = PrototypeSpeakerRecognitionModule(
        num_inp_features=n_mels,
        num_embedding=embedding_size,
        num_speakers=dm.num_speakers,
        learning_rate=learning_rate,
        val_trials=dm.val_trials,
        test_trials=dm.dev_trials,
    )

    # build trainer
    checkpointer = ModelCheckpoint(monitor="val_eer")
    trainer = pytorch_lightning.Trainer(
        max_epochs=epochs,
        gpus=gpus,
        callbacks=[checkpointer, LearningRateMonitor()],
        default_root_dir="logs",
    )

    # train loop
    trainer.fit(model, datamodule=dm)

    # test loop (on dev set)
    model = model.load_from_checkpoint(checkpointer.best_model_path)
    trainer.test(model, datamodule=dm)


if __name__ == "__main__":
    main()
## Use of the educational GPU cluster
The data science group has a small compute cluster for educational use. We are going to use this for the Speaker Recognition Challenge of the course [MLiP 2022](https://brightspace.ru.nl/d2l/home/264128).
The cluster consists of two _compute nodes_, lovingly named `cn47` and `cn48`, and a so-called _head node_, `cn99`. All these machines live in the domain `science.ru.nl`, so the head node's fully qualified name is `cn99.science.ru.nl`.
Both compute nodes have the following specifications:
- 8 Nvidia RTX 2080 Ti GPUs, each with 11 GB of memory
- 48 Xeon CPUs
- 128 GB memory, shared between the CPUs
- Linux Ubuntu 20.04 operating system
The head node has the same OS installed as the compute nodes, but it does not have GPUs and is not intended for heavy computation. The general idea is that you use the head node for
- simple editing and file manipulation
- submitting jobs to the compute nodes and controlling these jobs
You need a [science account](https://wiki.cncz.science.ru.nl/Nieuwe_studenten#.5BScience_login_.28vachternaam.29_.5D.5BScience_login_.28isurname.29.5D) in order to be able to log into the cluster.
These nodes are not directly accessible from the internet. In order to reach them you need to either
- use the science.ru [VPN](https://wiki.cncz.science.ru.nl/Vpn). This gives you direct access to `cn99`, which makes copying files through `scp` and `rsync`, remote editing, etc. somewhat easier:
  ```
  local+vpn$ ssh cn99
  ```
- or log in through the machine `lilo.science.ru.nl`. You might have to transport files in two steps, as only your (small) home filesystem `~` is available in one step:
  ```
  local$ ssh lilo.science.ru.nl
  lilo7$ ssh cn99
  ```

Either way, you will be working through a secure-shell connection, so you must have an `ssh` client on your local laptop/computer.
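If you routinely go through `lilo`, a jump-host entry in your ssh configuration saves some typing. A minimal `~/.ssh/config` sketch (assuming your science login is `yourlogin`; adjust the user name to your own):
```
Host lilo
    HostName lilo.science.ru.nl
    User yourlogin

Host cn99
    HostName cn99.science.ru.nl
    User yourlogin
    ProxyJump lilo
```
With this in place, `ssh cn99` from your own machine hops through `lilo` automatically, and `scp`/`rsync` to `cn99` work the same way.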
### Understanding the filesystems on the cluster
There are several places where you can store code and data. They have different characteristics:
filesystem | size | speed | scope
-----------|------|-------|------
`~` | 10 GB | fast | shared
`/scratch` | a few TB | fastest | local
`/ceph/csedu-scratch` | several TB | slow | shared
The limitations on the home filesystem, `~` (a.k.a. `$HOME`), are pretty tight: just installing PyTorch typically consumes a significant portion of your disk quota. We have a ["cluster preparation" script](../scripts/prepare_cluster.sh) that will set up an environment that gives you the best experience working on the cluster:
- python packages are installed in a virtual environment
- source data, logs, and models are put on the large shared filesystem `/ceph`
- python libraries are copied to the fast local filesystem `/scratch` on each compute node
- soft links to these places are made in the project directory
- the project code itself stays on the fast shared filesystem `~`
### Setting up an SSH key in order to clone this repo
First, generate a public/private key pair:
```
$ ssh-keygen
## and hit <return> a few times
```
Then, print your public key in the terminal and copy it to the clipboard:
```
$ cat ~/.ssh/id_rsa.pub
```
**DO NOT** copy `~/.ssh/id_rsa`, this is your private key!
The output should look like the following.
```
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDPq1jKbBfuHkcow+octo9lPW3TQTyU/W5CuSKYV4RvZXJlvdkMQj1JpjOJEyCBECaYfr8cDbeitvGaSEJHTeHLjpMRIyCtpieV5I0fn6YJJk6VzeiKTuhgOxWjlhQo6iZtAC8ptY/O88ZCIhIHe8qwcMVc7QAsblqZ2vx5N90m3MfACZLhMgQK4AvT3IF3EbH3Kg9hFwMaJZv+ywNCIvlI/DRGr6zncgng24u62W9dYPTcqKXNQxAPO9vXAue7PC/E9R/Jjffjn56ufpd71iYMbVWY8TYsDFuIhS/oeckNJz/7yoy2X3pq6GPehLBLds86TpyGBMTO6rL+3E1akcSB nvaessen@cn13
```
Now, in your browser, visit [gitlab.science.ru.nl](https://gitlab.science.ru.nl) and:
1. Navigate to your preferences.
![image.png](./images/0.png)
2. Go to the ssh keys tab
![image-1.png](./images/1.png)
3. Paste the public ssh key in the form and click `add key`.
![image-2.png](./images/2.png)
Now, log on to the cluster node `cn99` (through VPN or via `lilo`), and clone this repository:
```bash
$ mkdir mlip
$ cd mlip
$ git clone git@gitlab.science.ru.nl:nvaessen/tiny-voxceleb-skeleton-complete.git
$ cd tiny-voxceleb-skeleton-complete
```
### Setting up links and virtual environments in the cluster
If everything is all right, you have a reasonably clean set of files and directories upon first checkout (from now on we will drop the command prompt `$` in the example code):
```bash
ls
```
Now run the script for setting up the virtual environment and links to various places where data is / will be stored. This script will take several minutes to complete:
```bash
scripts/prepare_cluster.sh
ls -l
```
You will see soft links pointing to
- `data`, where the audio data is stored,
- `logs`, where results and log outputs of your scripts are stored,
- `venv`, the python virtual environment, which has been copied to the local disks of cluster nodes `cn47` and `cn48`.
## SLURM
The cluster is an environment where multiple people use computer resources in a co-operative way. Something needs to manage all these resources, and that process is called a _workload manager_. At science.ru we use [SLURM](https://slurm.schedmd.com/documentation.html) to do this, as do many other compute clusters around the world.
Slurm is a clever piece of software, but in the tradition of hard-core computing environments most of the documentation that is available is in plain text "man pages" and inaccessible mailing lists. View the experience as a time machine, going back to the 1970's...
### Running an interactive SLURM session
It is possible to ask for an interactive shell on one of the compute nodes. This only works smoothly if there is a slot available; if the cluster is "full", jobs wait until a slot frees up, which may take a while. An interactive session takes up a slot. In the example below we ask for a single GPU: the command `srun` is what makes it all happen, and the other commands run inside the session started by `srun`:
```
srun --pty --partition csedu --gres gpu:1 /bin/bash
hostname ## we're on cn47 or cn48
nvidia-smi ## it appears there is 1 GPU available in this machine
exit ## make the slot available again, exit to cn99 again
```
In general, we would advise against using the interactive shell option described here, GPU and all, unless you just need to do a quick check in a situation where a GPU is required.
### Queuing slurm jobs
The normal way of working on the cluster is by submitting a batch job. This consists of several components:
- a script (typically bash) that contains all instructions to run the job
- job control information specifying resources that you need for the job
- information on where to store the output (standard out and error)
A job is submitted using `sbatch`, specifying the script as an argument and the other information as options.
As an example, look at [this file](./../experiments/slurm-job.sh), which is a minimalistic script that just gives some information about the environment in which the script runs. You can submit this for running on the cluster using
```bash
sbatch --partition csedu --gres gpu:1 experiments/slurm-job.sh
squeue
```
The `sbatch` command returns immediately (unlike the `srun` earlier), and if you were quick enough typing `squeue` you might have seen your job either running or waiting in the job queue.
When the job has started, you will find a file named `slurm-$jobid.out` in the current working directory:
```bash
ls slurm-*
```
This is where the standard output of the script is collected.
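To keep an eye on a job while it runs, a few standard SLURM/Unix commands are enough (the job id below is a placeholder for your own job's id):
```bash
squeue -u $USER            # show only your own jobs
tail -f slurm-<jobid>.out  # follow the standard output of a running job
scancel <jobid>            # cancel a job you no longer need
```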
### More advanced slurm scripts
Having the metadata (`--partition`, `--gres`, etc.) on the command line, separate from the script, may not always be handy. Therefore SLURM allows the specification of the job metadata _inside_ the script, by using a special `#SBATCH` syntax. For bash (and most other script languages) the `#` starts a comment, so it has no meaning to the script itself.
A full example is in [the skeleton training script](./../experiments/experiment_1_cluster.sh). Inspect the top of this script; it contains tons of instructions for `sbatch`.
This skeleton training script is written in a "relative paths" style, assuming you will submit the job while your current working directory is the root of this repository, i.e., through calling `sbatch experiments/experiment_1_cluster.sh`. For example, the log files are specified as `./logs/slurm/%J.out`, where `./logs` refers to the link you made above when setting up the virtual environment. In this way we don't have to put "hard paths" in the script, which would include your user-specific installation directory, and the script works for every user.
The following `#SBATCH` options are used in this example (a sketch combining them follows the list):
- `--partition=csedu`: the subset of all science.ru nodes to use; we will always use `csedu`, which refers to `cn47` and `cn48`.
- `--gres=gpu:1`: we want one GPU
- `--mem=10G`: we think the job will not use more than 10GB of CPU memory
- `--cpus-per-task=6`: we want to claim 6 CPUs for this task (mainly for the dataloaders)
- `--time=6:00:00`: we expect the training to be finished well before 6 hours (wall clock) time. SLURM will terminate the job if it takes longer...
- `--output=./logs/slurm/%J.out`: The place where stdout is collected. `%J` refers to the job ID.
- `--error=./logs/slurm/%J.err`: This is where stderr is collected
- `--mail-type=BEGIN,END,FAIL`: specify that we want a mail message sent to our science account email at the start and finish, and in case of a failed job.
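Putting these together, the top of such a job script would look roughly like the sketch below; the option values are simply the ones listed above, and the remainder of the script would activate the environment and start the training:
```bash
#!/usr/bin/env bash
#SBATCH --partition=csedu
#SBATCH --gres=gpu:1
#SBATCH --mem=10G
#SBATCH --cpus-per-task=6
#SBATCH --time=6:00:00
#SBATCH --output=./logs/slurm/%J.out
#SBATCH --error=./logs/slurm/%J.err
#SBATCH --mail-type=BEGIN,END,FAIL

# ... the actual commands of the job (activate the venv, start training) go here
```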
When you are ready for it, you can run your first [skeleton speaker recognition](./skeleton.md) training job. The options of the command-line training script are explained [here](./skeleton.md); here we only show how to submit the job with slurm. Beware: completing the training takes several hours, even with this [minimalistic neural network](../skeleton/models/prototype.py#L124-126).
```bash
sbatch experiments/experiment_1_cluster.sh
```
You can now inspect the status of your job using `squeue`, and watch the training progress slowly using `tail -f logs/slurm/$jobid.out`.
### (optional) remote modifications with lsyncd
TODO
## Leaderboard
TODO
## Honour code
In order to keep this project fun but also educational, we ask you to respect the following rules:
1. You will not develop or publish your code in a publicly-accessible repository (or share it in another manner with peers outside your group)
2. You will not use a readily-accessible pre-trained network for speaker recognition as your solution.
3. You will not try to find out the identities of the speakers in the evaluation trials from other resources
4. You will not blindly copy code from the internet. You should aim to develop a solution yourself instead of using code or a library from a third party. Reading open-source code for inspiration is always allowed!
5. You will not launch more than one job on the SLURM cluster at the same time (as a group) and each job will time-out after at most 12 hours.
6. You will not try to cheat the system, e.g., by hacking into the submission server or by doing digital forensics on files and repositories
## Project details
You will be given a small subset of the Voxceleb dataset. This dataset contains 110 unique speakers, and each speaker has one or more audio recordings extracted from YouTube videos. Based on this data, your task will be to implement a system which learns to distinguish whether two audio recordings belong to the same speaker or to different speakers.
In order to evaluate the system, an evaluation set containing audio recordings of a number of *never-seen-before* speakers will be used at the end of the project. You will be given a list of trials, each between two audio recordings of these speakers, and for each trial your model should give a score that is lower, e.g., tending to 0, if the speakers are different and higher, e.g., tending to 1, if the speakers are the same.
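To make this concrete: one common (but not prescribed) way to produce such a score is the cosine similarity between the two recordings' speaker embeddings, mapped into the range [0, 1]. A minimal sketch, assuming you already have one embedding vector per recording; the function name and variables here are illustrative, not part of the skeleton code:
```python
import numpy as np

def trial_score(emb_a: np.ndarray, emb_b: np.ndarray) -> float:
    # cosine similarity lies in [-1, 1]; shift and scale it to [0, 1] so that
    # "same speaker" tends towards 1 and "different speaker" towards 0
    cos = np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b))
    return float((cos + 1.0) / 2.0)
```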
The data provided to you is split up as follows:
* train: audio files from 100 speakers. 50 male, 50 female. The number of audio files and the number of recording sources differ per speaker.
* val: audio files from 2 recording sources held out from the training split, for all 100 training speakers. A trial list of length 10_000 over these audio files.
* dev: audio recordings from 10 'unseen' speakers who do not appear in the train/val split. 5 male, 5 female. A trial list of length 10_000 over these audio files.
* eval: unlabeled audio recordings from unseen speakers, which are only used at the end of the project for the definitive evaluation. A trial list of length 37611 over these audio files.
## Where to find the data
You can readily access the data on the cluster. It's located at `/ceph/csedu-scratch/course/IMC030_MLIP/data` and structured as follows:
```
/ceph/csedu-scratch/course/IMC030_MLIP/data/
├── tiny-voxceleb
│ ├── dev
│ ├── eval
│ ├── train
│ ├── val
│ ├── dev_trials.txt
│ ├── eval_trials_no_gt.txt
│ ├── tiny_meta.csv
│ └── val_trials.txt
├── tiny-voxceleb-shards
│ ├── dev
│ ├── eval
│ ├── train
│ └── val
└── data.zip
```
The subfolders of `tiny-voxceleb` contain all audio files of the respective train, val, dev, and eval sets as separate `.wav` files.
The folder also contains the trials for the validation and dev sets (with ground-truth labels), as well as the trials for the eval set (without ground-truth labels).
The `tiny_meta.csv` file contains some meta information about each speaker (gender, nationality, and amount of data).
The `tiny-voxceleb-shards` folder contains the respective audio files in `.tar` files, according to the [webdataset](https://github.com/webdataset/webdataset) format. This is required for training on the cluster, as randomly accessing a lot of small files on a network-attached storage system is very slow.
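If you want to peek inside a shard yourself, the `webdataset` library can iterate over a `.tar` shard directly. A minimal sketch (the shard filename is a placeholder; the skeleton's own loading code lives in `skeleton/data/tiny_voxceleb.py`):
```python
import webdataset as wds

# each sample is a dict; its keys are the file extensions found in the tar,
# plus "__key__" holding the sample identifier
dataset = wds.WebDataset("tiny-voxceleb-shards/train/shard-000000.tar")

for sample in dataset:
    print(sample["__key__"], list(sample.keys()))
    break
```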
If you want to play with the data, or do some training, on your own computer, you can use `scripts/download_data.sh` to download the zip file to your local `$PROJECT_DIR/data` folder, where you can then extract it.
The `data.zip` file contains the two folders `tiny-voxceleb` and `tiny-voxceleb-shards`.
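For local experimentation the steps are roughly as follows (a sketch; check `scripts/download_data.sh` itself for what it does exactly):
```bash
scripts/download_data.sh   # download data.zip into the local data/ folder
cd data
unzip data.zip             # extracts tiny-voxceleb/ and tiny-voxceleb-shards/
```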