diff --git a/dataset_loader.py b/dataset_loader.py
index 582af6c..91a6859 100644
--- a/dataset_loader.py
+++ b/dataset_loader.py
@@ -8,9 +8,14 @@
 import soundfile as sf
 import pandas as pd
 import glob
 from tqdm import tqdm
+import pathlib
 
-# generator function. It reads the csv file with pandas and loads the largest audio segments from each recording. If extend=False, it will only read the segments with length>length_seg, trim them and yield them with no further processing. Otherwise, if the segment length is inferior, it will extend the length using concatenative synthesis.
+# generator function. It reads the csv file with pandas and loads the largest
+# audio segments from each recording. If extend=False, it will only read the
+# segments with length>length_seg, trim them and yield them with no further
+# processing. Otherwise, if the segment length is inferior, it will extend the
+# length using concatenative synthesis.
 def __noise_sample_generator(info_file, fs, length_seq, split):
     head = os.path.split(info_file)[0]
     load_data = pd.read_csv(info_file)
@@ -24,20 +29,24 @@ def __noise_sample_generator(info_file, fs, length_seq, split):
     for i in r:
         segments = ast.literal_eval(load_data_split.loc[i, "segments"])
         if split == "test":
-            loaded_data, Fs = sf.read(
-                os.path.join(
-                    head,
-                    load_data_split["recording"].loc[i],
-                    load_data_split["largest_segment"].loc[i],
-                )
+            path = os.path.join(
+                head,
+                load_data_split["recording"].loc[i],
+                load_data_split["largest_segment"].loc[i],
             )
+            if not pathlib.Path(path).is_file():
+                print(f'WARNING! file does not exist: {path}')
+                continue
+            loaded_data, Fs = sf.read(path)
         else:
             num = np.random.randint(0, len(segments))
-            loaded_data, Fs = sf.read(
-                os.path.join(
-                    head, load_data_split["recording"].loc[i], segments[num]
-                )
+            path = os.path.join(
+                head, load_data_split["recording"].loc[i], segments[num]
             )
+            if not pathlib.Path(path).is_file():
+                print(f'WARNING! file does not exist: {path}')
+                continue
+            loaded_data, Fs = sf.read(path)
         assert fs == Fs, "wrong sampling rate"
         yield __extend_sample_by_repeating(loaded_data, fs, length_seq)
 