Source code for easycv.datasets.classification.data_sources.image_list

# Copyright (c) Alibaba, Inc. and its affiliates.
import logging
import os
import time

from PIL import Image, ImageFile

from easycv.datasets.registry import DATASOURCES
from easycv.file import io
from easycv.utils.dist_utils import dist_zero_exec
from .utils import split_listfile_byrank


[docs]@DATASOURCES.register_module class ClsSourceImageList(object): """ data source for classification Args: list_file : str / list(str), str means a input image list file path, this file contains records as `image_path label` in list_file list(str) means multi image list, each one contains some records as `image_path label` root: str / list(str), root path for image_path, each list_file will need a root, if len(root) < len(list_file), we will use root[-1] to fill root list. delimeter: str, delimeter of each line in the `list_file` split_huge_listfile_byrank: Adapt to the situation that the memory cannot fully load a huge amount of data list. If split, data list will be split to each rank. split_label_balance: if `split_huge_listfile_byrank` is true, whether split with label balance cache_path: if `split_huge_listfile_byrank` is true, cache list_file will be saved to cache_path. max_try: int, max try numbers of reading image """
[docs] def __init__(self, list_file, root='', delimeter=' ', split_huge_listfile_byrank=False, split_label_balance=False, cache_path='data/', max_try=20): ImageFile.LOAD_TRUNCATED_IMAGES = True self.max_try = max_try # DistributedMPSampler need this attr self.has_labels = True if isinstance(list_file, str): assert isinstance(root, str), 'list_file is str, root must be str' list_file = [list_file] root = [root] else: assert isinstance(list_file, list), \ 'list_file should be str or list(str)' root = [root] if isinstance(root, str) else root if not isinstance(root, list): raise ValueError('root must be str or list(str), but get %s' % type(root)) if len(root) < len(list_file): logging.warning( 'len(root) < len(list_file), fill root with root last!') root = root + [root[-1]] * (len(list_file) - len(root)) # TODO: support return list, donot save split file # TODO: support loading list_file that have already been split if split_huge_listfile_byrank: with dist_zero_exec(): list_file = split_listfile_byrank( list_file=list_file, label_balance=split_label_balance, save_path=cache_path) self.fns = [] self.labels = [] for l, r in zip(list_file, root): fns, labels = self.parse_list_file(l, r, delimeter) self.fns += fns self.labels += labels
[docs] @staticmethod def parse_list_file(list_file, root, delimeter): with io.open(list_file, 'r') as f: lines = f.readlines() fns = [] labels = [] for l in lines: splits = l.strip().split(delimeter) fns.append(os.path.join(root, splits[0])) # must be int,other with mmcv collect will crash label = [int(i) for i in splits[1:]] labels.append( label[0]) if len(label) == 1 else labels.append(label) return fns, labels
[docs] def get_length(self): return len(self.fns)
[docs] def get_sample(self, idx): img = None try_idx = 0 while img is None and try_idx < self.max_try: try: img = Image.open(io.open(self.fns[idx], 'rb')) if img.mode != 'RGB': img = img.convert('RGB') except: # frequent access to oss will cause error, sleep can aviod it time.sleep(1) logging.warning('Try read file fault, %s' % self.fns[idx]) img = None try_idx += 1 if img is None: return self.get_sample(idx + 1) label = self.labels[idx] result_dict = {'img': img, 'gt_labels': label} return result_dict