# Source code for pype.binfmisc

from pype.misc import xopen


class fastq():
    """Fastq iterator to extract name, sequence and quality of each read."""

    def __init__(self, f, n=-1):
        """
        Specify the file object to iterate.

        :param f: fastq file; it is consumed with ``next()``, so it must
            be an iterator over lines (an open text file qualifies)
        :type f: File
        :param n: number of reads to evaluate, defaults to -1. The default
            never equals the read counter, so every read is returned.
        :type n: int, optional
        """
        self.fastq = f
        self.n = n
        # Number of complete records returned so far.
        self.counter = 0

    def __iter__(self):
        return self

    def __next__(self):
        """
        Read the next 4 lines of the fastq file.

        Implement a __next__ magic method to iterate the fastq file
        and return a dictionary with 'name', 'seq', 'sep' and 'qual' keys.
        The 'sep' line is collected for consistency, despite the fact that
        there may be no use for it. Each line is stripped of surrounding
        whitespace.

        :raises StopIteration: Maximum number of reads reached, or the
            underlying file is exhausted. If the file ends part-way
            through a record, the partial record is discarded.
        :return: A dictionary with 'name', 'seq', 'sep' and 'qual' keys.
        :rtype: dict
        """
        if self.counter == self.n:
            raise StopIteration
        # StopIteration raised by next() at end of file propagates as-is
        # and ends the iteration; no explicit re-raise is needed.
        name = next(self.fastq).strip()
        read = next(self.fastq).strip()
        sep = next(self.fastq).strip()
        qual = next(self.fastq).strip()
        self.counter += 1
        return {
            'name': name, 'seq': read,
            'sep': sep, 'qual': qual}


def parse_fastq_name_illumina_1_8(line):
    """
    Extract run information from a space-separated fastq read name.

    The name is split on single spaces and every chunk is split on ':'.
    From the first chunk, field 0 (minus a leading '@') is the machine
    id, field 2 the flowcell id and field 3 the lane; from the second
    chunk, field 0 is the mate number and field 3 the index.
    Presumably this matches the Illumina/Casava 1.8 identifier layout
    ``@instrument:run:flowcell:lane:tile:x:y read:filter:control:index``
    -- confirm against the data being parsed.

    :param line: fastq read name
    :type line: str
    :raises IndexError: The name has fewer chunks or fields than needed.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    chunks = [chunk.split(':') for chunk in line.split(' ')]
    location = chunks[0]

    instrument = location[0]
    machine_id = instrument[1:] if instrument.startswith('@') else instrument
    flowcell_id, lane_nr = location[2], location[3]

    # The second space-separated chunk holds the per-read members.
    read_members = chunks[1]
    mate_nr, index_id = read_members[0], read_members[3]

    return {
        'machine_id': machine_id,
        'flowcell_id': flowcell_id,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': index_id
    }


def parse_fastq_name_illumina_1_4(line):
    """
    Extract run information from a '#'-separated fastq read name.

    The name is split on '#' and every section is split on ':'. From the
    first section, field 0 (minus a leading '@') is the machine id and
    field 1 the lane. The first field of the second section must have
    the form ``index/mate``. No flowcell id is available in this layout,
    so 'flowcell_id' is always None. Presumably this matches names such
    as ``@HWUSI-EAS100R:6:73:941:1973#0/1`` -- confirm against the data.

    :param line: fastq read name
    :type line: str
    :raises IndexError: The name has no '#' section or no lane field.
    :raises ValueError: The text after '#' does not split into exactly
        two parts on '/'.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    sections = [section.split(':') for section in line.split('#')]
    location = sections[0]

    instrument = location[0]
    machine_id = instrument[1:] if instrument.startswith('@') else instrument
    lane_nr = location[1]

    # Exactly two '/'-separated parts are required; anything else raises.
    index_id, mate_nr = sections[1][0].split('/')

    return {
        'machine_id': machine_id,
        'flowcell_id': None,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': index_id
    }


def parse_fastq_name_illumina_no_index(line):
    """
    Extract run information from a ':'-separated read name with no index.

    The name is split on ':'. Field 0 (minus a leading '@') is the
    machine id and field 1 the lane. Field 4 is split on '/'; when that
    yields exactly two parts the second one is the mate number,
    otherwise the mate is None. 'flowcell_id' and 'index' are always
    None. Presumably this matches names such as
    ``@HWUSI-EAS100R:6:73:941:1973/1`` -- confirm against the data.

    :param line: fastq read name
    :type line: str
    :raises IndexError: The name has fewer than five ':'-separated fields.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    fields = line.split(':')
    machine_id = fields[0]
    if machine_id.startswith('@'):
        machine_id = machine_id[1:]
    lane_nr = fields[1]

    # Only the part after a single '/' is of interest; the part before it
    # is discarded. Any other number of parts means there is no mate.
    tail = fields[4].split('/')
    mate_nr = tail[1] if len(tail) == 2 else None

    return {
        'machine_id': machine_id,
        'flowcell_id': None,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': None
    }


def _parse_fastq_read_name(name):
    """Parse a read name by trying the known name layouts in order."""
    for parser in (parse_fastq_name_illumina_1_4,
                   parse_fastq_name_illumina_1_8):
        try:
            return parser(name)
        except (IndexError, ValueError):
            # IndexError: the name has too few fields for this layout.
            # ValueError: the 1.4 parser found a '#' section that is not
            # a single "index/mate" pair. Either way, try the next one.
            continue
    # Last resort; an IndexError from this parser is left to propagate.
    return parse_fastq_name_illumina_no_index(name)


def fastq_name_info(fastq_file, n=50000, strict=False):
    """
    Return the run information encoded in the read names of a fastq file.

    Up to ``n`` reads are taken from the file, which is opened with
    ``xopen`` in 'rt' mode. Each read name is parsed with the first
    layout that fits, tried in this order: 1.4, 1.8, no index. The
    information of the first read is returned.

    :param fastq_file: fastq file, passed as is to ``xopen``
    :param n: maximum number of reads to parse, defaults to 50000
    :type n: int, optional
    :param strict: require every parsed read to carry the same
        information as the first one, defaults to False
    :type strict: bool, optional
    :raises IndexError: No read could be taken from the file, or a read
        name fits none of the supported layouts.
    :raises Exception: ``strict`` is set and the reads disagree.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    with xopen(fastq_file, 'rt') as fq:
        reads_info = [
            _parse_fastq_read_name(item['name']) for item in fastq(fq, n)]

    if not reads_info:
        # Same exception type as the former bare reads_info[0] lookup,
        # but with a message that names the actual problem.
        raise IndexError('No reads found in fastq file: %s' % (fastq_file,))
    info = reads_info[0]

    if strict and any(item != info for item in reads_info):
        raise Exception('Unsuccessful fastq parsing')

    return info