GATK ReadsPathDataSource类介绍

GATK（Genome Analysis Toolkit）是一个广泛使用的基因组分析工具包，它的核心库之一是htsjdk，用于处理高通量测序数据。在GATK中，ReadsPathDataSource类是负责管理和提供读取高通量测序数据文件（如BAM、SAM、CRAM）的类。

常见使用场景

数据加载 ：在GATK的基因组分析工具链中，ReadsPathDataSource 经常被用来从指定路径加载测序数据。
数据过滤 ：通过 ReadsPathDataSource，可以方便地在加载数据的同时进行预过滤，如按特定标准选择感兴趣的序列记录。
多文件支持：支持同时从多个文件中加载数据，使得分析多个样本的数据更加便捷。

类关系

ReadsPathDataSource源码

复制代码

package org.broadinstitute.hellbender.engine;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.IOUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.iterators.SAMRecordToReadIterator;
import org.broadinstitute.hellbender.utils.iterators.SamReaderQueryingIterator;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadConstants;

import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Manages traversals and queries over sources of reads which are accessible via {@link Path}s
 * (for now, SAM/BAM/CRAM files only).
 *
 * Two basic operations are available:
 *
 * -Iteration over all reads, optionally restricted to reads that overlap a set of intervals
 * -Targeted queries by one interval at a time
 */
public final class ReadsPathDataSource implements ReadsDataSource {
    private static final Logger logger = LogManager.getLogger(ReadsPathDataSource.class);

    /**
     * Mapping from SamReaders to iterators over the reads from each reader. Only one
     * iterator can be open from a given reader at a time (this is a restriction
     * in htsjdk). Iterator is set to null for a reader if no iteration is currently
     * active on that reader.
     */
    private final Map<SamReader, CloseableIterator<SAMRecord>> readers;

    /**
     * Hang onto the input files so that we can print useful errors about them
     */
    private final Map<SamReader, Path> backingPaths;

    /**
     * Only reads that overlap these intervals (and unmapped reads, if {@link #traverseUnmapped} is set) will be returned
     * during a full iteration. Null if iteration is unbounded.
     *
     * Individual queries are unaffected by these intervals -- only traversals initiated via {@link #iterator} are affected.
     */
    private List<SimpleInterval> intervalsForTraversal;

    /**
     * If true, restrict traversals to unmapped reads (and reads overlapping any {@link #intervalsForTraversal}, if set).
     * False if iteration is unbounded or bounded only by our {@link #intervalsForTraversal}.
     *
     * Note that this setting covers only unmapped reads that have no position -- unmapped reads that are assigned the
     * position of their mates will be returned by queries overlapping that position.
     *
     * Individual queries are unaffected by this setting  -- only traversals initiated via {@link #iterator} are affected.
     */
    private boolean traverseUnmapped;

    /**
     * Used to create a merged Sam header when we're dealing with multiple readers. Null if we only have a single reader.
     */
    private final SamFileHeaderMerger headerMerger;

    /**
     * Are indices available for all files?
     */
    private boolean indicesAvailable;

    /**
     * Has it been closed already.
     */
    private boolean isClosed;

    /**
     * Initialize this data source with a single SAM/BAM file and validation stringency SILENT.
     *
     * @param samFile SAM/BAM file, not null.
     */
    public ReadsPathDataSource( final Path samFile ) {
        this(samFile != null ? Arrays.asList(samFile) : null, (SamReaderFactory)null);
    }

    /**
     * Initialize this data source with multiple SAM/BAM files and validation stringency SILENT.
     *
     * @param samFiles SAM/BAM files, not null.
     */
    public ReadsPathDataSource( final List<Path> samFiles ) {
        this(samFiles, (SamReaderFactory)null);
    }

    /**
     * Initialize this data source with a single SAM/BAM file and a custom SamReaderFactory
     *
     * @param samPath path to SAM/BAM file, not null.
     * @param customSamReaderFactory SamReaderFactory to use, if null a default factory with no reference and validation
     *                               stringency SILENT is used.
     */
    public ReadsPathDataSource( final Path samPath, SamReaderFactory customSamReaderFactory ) {
        this(samPath != null ? Arrays.asList(samPath) : null, customSamReaderFactory);
    }

    /**
     * Initialize this data source with multiple SAM/BAM files and a custom SamReaderFactory
     *
     * @param samPaths path to SAM/BAM file, not null.
     * @param customSamReaderFactory SamReaderFactory to use, if null a default factory with no reference and validation
     *                               stringency SILENT is used.
     */
    public ReadsPathDataSource( final List<Path> samPaths, SamReaderFactory customSamReaderFactory ) {
        this(samPaths, null, customSamReaderFactory, 0, 0);
    }

    /**
     * Initialize this data source with multiple SAM/BAM/CRAM files, and explicit indices for those files.
     *
     * @param samPaths paths to SAM/BAM/CRAM files, not null
     * @param samIndices indices for all of the SAM/BAM/CRAM files, in the same order as samPaths. May be null,
     *                   in which case index paths are inferred automatically.
     */
    public ReadsPathDataSource( final List<Path> samPaths, final List<Path> samIndices ) {
        this(samPaths, samIndices, null, 0, 0);
    }

    /**
     * Initialize this data source with multiple SAM/BAM/CRAM files, explicit indices for those files,
     * and a custom SamReaderFactory.
     *
     * @param samPaths paths to SAM/BAM/CRAM files, not null
     * @param samIndices indices for all of the SAM/BAM/CRAM files, in the same order as samPaths. May be null,
     *                   in which case index paths are inferred automatically.
     * @param customSamReaderFactory SamReaderFactory to use, if null a default factory with no reference and validation
     *                               stringency SILENT is used.
     */
    public ReadsPathDataSource( final List<Path> samPaths, final List<Path> samIndices,
                               SamReaderFactory customSamReaderFactory ) {
        this(samPaths, samIndices, customSamReaderFactory, 0, 0);
    }

    /**
     * Initialize this data source with multiple SAM/BAM/CRAM files, explicit indices for those files,
     * and a custom SamReaderFactory.
     *
     * @param samPaths paths to SAM/BAM/CRAM files, not null
     * @param samIndices indices for all of the SAM/BAM/CRAM files, in the same order as samPaths. May be null,
     *                   in which case index paths are inferred automatically.
     * @param customSamReaderFactory SamReaderFactory to use, if null a default factory with no reference and validation
     *                               stringency SILENT is used.
     * @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     */
    public ReadsPathDataSource( final List<Path> samPaths, final List<Path> samIndices,
                               SamReaderFactory customSamReaderFactory,
                               int cloudPrefetchBuffer, int cloudIndexPrefetchBuffer) {
        this(samPaths, samIndices, customSamReaderFactory,
                BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
                BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer) );
    }


    /**
     * Initialize this data source with multiple SAM/BAM/CRAM files, explicit indices for those files,
     * and a custom SamReaderFactory.
     *
     * @param samPaths paths to SAM/BAM/CRAM files, not null
     * @param samIndices indices for all of the SAM/BAM/CRAM files, in the same order as samPaths. May be null,
     *                   in which case index paths are inferred automatically.
     * @param customSamReaderFactory SamReaderFactory to use, if null a default factory with no reference and validation
     *                               stringency SILENT is used.
     * @param cloudWrapper caching/prefetching wrapper for the data, if on Google Cloud.
     * @param cloudIndexWrapper caching/prefetching wrapper for the index, if on Google Cloud.
     */
    public ReadsPathDataSource( final List<Path> samPaths, final List<Path> samIndices,
                               SamReaderFactory customSamReaderFactory,
                               Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
                               Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper ) {
        Utils.nonNull(samPaths);
        Utils.nonEmpty(samPaths, "ReadsPathDataSource cannot be created from empty file list");

        if ( samIndices != null && samPaths.size() != samIndices.size() ) {
            throw new UserException(String.format("Must have the same number of BAM/CRAM/SAM paths and indices. Saw %d BAM/CRAM/SAMs but %d indices",
                                                  samPaths.size(), samIndices.size()));
        }

        readers = new LinkedHashMap<>(samPaths.size() * 2);
        backingPaths = new LinkedHashMap<>(samPaths.size() * 2);
        indicesAvailable = true;

        final SamReaderFactory samReaderFactory =
                customSamReaderFactory == null ?
                    SamReaderFactory.makeDefault().validationStringency(ReadConstants.DEFAULT_READ_VALIDATION_STRINGENCY) :
                    customSamReaderFactory;

        int samCount = 0;
        for ( final Path samPath : samPaths ) {
            // Ensure each file can be read
            try {
                IOUtil.assertFileIsReadable(samPath);
            }
            catch ( SAMException|IllegalArgumentException e ) {
                throw new UserException.CouldNotReadInputFile(samPath.toString(), e);
            }

            Function<SeekableByteChannel, SeekableByteChannel> wrapper =
                (BucketUtils.isEligibleForPrefetching(samPath)
                    ? cloudWrapper
                    : Function.identity());
            // if samIndices==null then we'll guess the index name from the file name.
            // If the file's on the cloud, then the search will only consider locations that are also
            // in the cloud.
            Function<SeekableByteChannel, SeekableByteChannel> indexWrapper =
                ((samIndices != null && BucketUtils.isEligibleForPrefetching(samIndices.get(samCount))
                 || (samIndices == null && BucketUtils.isEligibleForPrefetching(samPath)))
                    ? cloudIndexWrapper
                    : Function.identity());

            SamReader reader;
            if ( samIndices == null ) {
                reader = samReaderFactory.open(samPath, wrapper, indexWrapper);
            }
            else {
                final SamInputResource samResource = SamInputResource.of(samPath, wrapper);
                Path indexPath = samIndices.get(samCount);
                samResource.index(indexPath, indexWrapper);
                reader = samReaderFactory.open(samResource);
            }

            // Ensure that each file has an index
            if ( ! reader.hasIndex() ) {
                indicesAvailable = false;
            }

            readers.put(reader, null);
            backingPaths.put(reader, samPath);
            ++samCount;
        }

        // Prepare a header merger only if we have multiple readers
        headerMerger = samPaths.size() > 1 ? createHeaderMerger() : null;
    }

    /**
     * Are indices available for all files?
     */
    public boolean indicesAvailable() {
        return indicesAvailable;
    }

    /**
     * @return true if indices are available for all inputs.
     * This is identical to {@link #indicesAvailable}
     */
    @Override
    public boolean isQueryableByInterval() {
        return indicesAvailable();
    }

    /**
     * Restricts a traversal of this data source via {@link #iterator} to only return reads that overlap the given intervals,
     * and to unmapped reads if specified.
     *
     * Calls to {@link #query} are not affected by this method.
     *
     * @param intervals Our next full traversal will return reads overlapping these intervals
     * @param traverseUnmapped Our next full traversal will return unmapped reads (this affects only unmapped reads that
     *                         have no position -- unmapped reads that have the position of their mapped mates will be
     *                         included if the interval overlapping that position is included).
     */
    @Override
    public void setTraversalBounds( final List<SimpleInterval> intervals, final boolean traverseUnmapped ) {
        // Set intervalsForTraversal to null if intervals is either null or empty
        this.intervalsForTraversal = intervals != null && ! intervals.isEmpty() ? intervals : null;
        this.traverseUnmapped = traverseUnmapped;

        if ( traversalIsBounded() && ! indicesAvailable ) {
            raiseExceptionForMissingIndex("Traversal by intervals was requested but some input files are not indexed.");
        }
    }

    /**
     * @return True if traversals initiated via {@link #iterator} will be restricted to reads that overlap intervals
     *         as configured via {@link #setTraversalBounds}, otherwise false
     */
    @Override
    public boolean traversalIsBounded() {
        return intervalsForTraversal != null || traverseUnmapped;
    }

    private void raiseExceptionForMissingIndex( String reason ) {
        String commandsToIndex = backingPaths.entrySet().stream()
                .filter(f -> !f.getKey().hasIndex())
                .map(Map.Entry::getValue)
                .map(Path::toAbsolutePath)
                .map(f -> "samtools index " + f)
                .collect(Collectors.joining("\n","\n","\n"));

        throw new UserException(reason + "\nPlease index all input files:\n" + commandsToIndex);
    }

    /**
     * Iterate over all reads in this data source. If intervals were provided via {@link #setTraversalBounds},
     * iteration is limited to reads that overlap that set of intervals.
     *
     * @return An iterator over the reads in this data source, limited to reads that overlap the intervals supplied
     *         via {@link #setTraversalBounds} (if intervals were provided)
     */
    @Override
    public Iterator<GATKRead> iterator() {
        logger.debug("Preparing readers for traversal");
        return prepareIteratorsForTraversal(intervalsForTraversal, traverseUnmapped);
    }

    /**
     * Query reads over a specific interval. This operation is not affected by prior calls to
     * {@link #setTraversalBounds}
     *
     * @param interval The interval over which to query
     * @return Iterator over reads overlapping the query interval
     */
    @Override
    public Iterator<GATKRead> query( final SimpleInterval interval ) {
        if ( ! indicesAvailable ) {
            raiseExceptionForMissingIndex("Cannot query reads data source by interval unless all files are indexed");
        }

        return prepareIteratorsForTraversal(Arrays.asList(interval));
    }

    /**
     * @return An iterator over just the unmapped reads with no assigned position. This operation is not affected
     *         by prior calls to {@link #setTraversalBounds}. The underlying file must be indexed.
     */
    @Override
    public Iterator<GATKRead> queryUnmapped() {
        if ( ! indicesAvailable ) {
            raiseExceptionForMissingIndex("Cannot query reads data source by interval unless all files are indexed");
        }

        return prepareIteratorsForTraversal(null, true);
    }

    /**
     * Returns the SAM header for this data source. Will be a merged header if there are multiple readers.
     * If there is only a single reader, returns its header directly.
     *
     * @return SAM header for this data source
     */
    @Override
    public SAMFileHeader getHeader() {
        return headerMerger != null ? headerMerger.getMergedHeader() : readers.entrySet().iterator().next().getKey().getFileHeader();
    }

    /**
     * Prepare iterators over all readers in response to a request for a complete iteration or query
     *
     * If there are multiple intervals, they must have been optimized using QueryInterval.optimizeIntervals()
     * before calling this method.
     *
     * @param queryIntervals Intervals to bound the iteration (reads must overlap one of these intervals). If null, iteration is unbounded.
     * @return Iterator over all reads in this data source, limited to overlap with the supplied intervals
     */
    private Iterator<GATKRead> prepareIteratorsForTraversal( final List<SimpleInterval> queryIntervals ) {
        return prepareIteratorsForTraversal(queryIntervals, false);
    }

    /**
     * Prepare iterators over all readers in response to a request for a complete iteration or query
     *
     * @param queryIntervals Intervals to bound the iteration (reads must overlap one of these intervals). If null, iteration is unbounded.
     * @return Iterator over all reads in this data source, limited to overlap with the supplied intervals
     */
    private Iterator<GATKRead> prepareIteratorsForTraversal( final List<SimpleInterval> queryIntervals, final boolean queryUnmapped ) {
        // htsjdk requires that only one iterator be open at a time per reader, so close out
        // any previous iterations
        closePreviousIterationsIfNecessary();

        final boolean traversalIsBounded = (queryIntervals != null && ! queryIntervals.isEmpty()) || queryUnmapped;

        // Set up an iterator for each reader, bounded to overlap with the supplied intervals if there are any
        for ( Map.Entry<SamReader, CloseableIterator<SAMRecord>> readerEntry : readers.entrySet() ) {
            if (traversalIsBounded) {
                readerEntry.setValue(
                        new SamReaderQueryingIterator(
                                readerEntry.getKey(),
                                readers.size() > 1 ?
                                        getIntervalsOverlappingReader(readerEntry.getKey(), queryIntervals) :
                                        queryIntervals,
                                queryUnmapped
                        )
                );
            } else {
                readerEntry.setValue(readerEntry.getKey().iterator());
            }
        }

        // Create a merging iterator over all readers if necessary. In the case where there's only a single reader,
        // return its iterator directly to avoid the overhead of the merging iterator.
        Iterator<SAMRecord> startingIterator = null;
        if ( readers.size() == 1 ) {
            startingIterator = readers.entrySet().iterator().next().getValue();
        }
        else {
            startingIterator = new MergingSamRecordIterator(headerMerger, readers, true);
        }

        return new SAMRecordToReadIterator(startingIterator);
    }

    /**
     * Reduce the intervals down to only include ones that can actually intersect with this reader
     */
    private List<SimpleInterval> getIntervalsOverlappingReader(
            final SamReader samReader,
            final List<SimpleInterval> queryIntervals )
    {
        final SAMSequenceDictionary sequenceDictionary = samReader.getFileHeader().getSequenceDictionary();
        return queryIntervals.stream()
                .filter(interval -> IntervalUtils.intervalIsOnDictionaryContig(interval, sequenceDictionary))
                .collect(Collectors.toList());
    }

    /**
     * Create a header merger from the individual SAM/BAM headers in our readers
     *
     * @return a header merger containing all individual headers in this data source
     */
    private SamFileHeaderMerger createHeaderMerger() {
        List<SAMFileHeader> headers = new ArrayList<>(readers.size());
        for ( Map.Entry<SamReader, CloseableIterator<SAMRecord>> readerEntry : readers.entrySet() ) {
            headers.add(readerEntry.getKey().getFileHeader());
        }

        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(identifySortOrder(headers), headers, true);
        return headerMerger;
    }

    @VisibleForTesting
    static SAMFileHeader.SortOrder identifySortOrder( final List<SAMFileHeader> headers ){
        final Set<SAMFileHeader.SortOrder> sortOrders = headers.stream().map(SAMFileHeader::getSortOrder).collect(Collectors.toSet());
        final SAMFileHeader.SortOrder order;
        if (sortOrders.size() == 1) {
            order = sortOrders.iterator().next();
        } else {
            order = SAMFileHeader.SortOrder.unsorted;
            logger.warn("Inputs have different sort orders. Assuming {} sorted reads for all of them.", order);
        }
        return order;
    }

    /**
     * @return true if this {@code ReadsPathDataSource} supports serial iteration (has only non-SAM inputs). If any
     * input has type==SAM_TYPE (is backed by a SamFileReader) this will return false, since SamFileReader
     * doesn't support serial iterators, and can't be serially re-traversed without re-initialization of the
     * underlying reader (and {@code ReadsPathDataSource}.
     */
    public boolean supportsSerialIteration() {
        return !hasSAMInputs();
    }

    /**
     * Shut down this data source permanently, closing all iterations and readers.
     */
    @Override
    public void close() {
        if (isClosed) {
            return;
        }
        isClosed = true;
        closePreviousIterationsIfNecessary();

        try {
            for ( Map.Entry<SamReader, CloseableIterator<SAMRecord>> readerEntry : readers.entrySet() ) {
                readerEntry.getKey().close();
            }
        }
        catch ( IOException e ) {
            throw new GATKException("Error closing SAMReader");
        }
    }

    boolean isClosed() {
        return isClosed;
    }

    /**
     * Close any previously-opened iterations over our readers (htsjdk allows only one open iteration per reader).
     */
    private void closePreviousIterationsIfNecessary() {
        for ( Map.Entry<SamReader, CloseableIterator<SAMRecord>> readerEntry : readers.entrySet() ) {
            CloseableIterator<SAMRecord> readerIterator = readerEntry.getValue();
            if ( readerIterator != null ) {
                readerIterator.close();
                readerEntry.setValue(null);
            }
        }
    }

    // Return true if any input is has type==SAM_TYPE (is backed by a SamFileReader) since SamFileReader
    // doesn't support serial iterators and can't be serially re-traversed without re-initialization of the
    // reader
    private boolean hasSAMInputs() {
        return readers.keySet().stream().anyMatch(r -> r.type().equals(SamReader.Type.SAM_TYPE));
    }

    /**
     * Get the sequence dictionary for this ReadsPathDataSource
     *
     * @return SAMSequenceDictionary from the SAMReader backing this if there is only 1 input file, otherwise the merged SAMSequenceDictionary from the merged header
     */
    @Override
    public SAMSequenceDictionary getSequenceDictionary() {
        return getHeader().getSequenceDictionary();
    }


}

ReadsDataSource源码

复制代码

package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.Iterator;
import java.util.List;

/**
 *
 * An interface for managing traversals over sources of reads.
 *
 * Two basic operations are available:
 *
 * -Iteration over all reads, optionally restricted to reads that overlap a set of intervals
 * -Targeted queries by one interval at a time
 */
public interface ReadsDataSource extends GATKDataSource<GATKRead>, AutoCloseable {

    /**
     * Restricts a traversal of this data source via {@link #iterator} to only return reads that overlap the given intervals,
     * and to unmapped reads if specified.
     *
     * Calls to {@link #query} are not affected by this method.
     *
     * @param intervals Our next full traversal will return reads overlapping these intervals
     * @param traverseUnmapped Our next full traversal will return unmapped reads (this affects only unmapped reads that
     *                         have no position -- unmapped reads that have the position of their mapped mates will be
     *                         included if the interval overlapping that position is included).
     */
    void setTraversalBounds(List<SimpleInterval> intervals, boolean traverseUnmapped);

    /**
     * Restricts a traversal of this data source via {@link #iterator} to only return reads which overlap the given intervals.
     * Calls to {@link #query} are not affected by setting these intervals.
     *
     * @param intervals Our next full traversal will return only reads overlapping these intervals
     */
    default void setTraversalBounds(List<SimpleInterval> intervals) {
        setTraversalBounds(intervals, false);
    }

    /**
     * Restricts a traversal of this data source via {@link #iterator} to only return reads that overlap the given intervals,
     * and to unmapped reads if specified.
     *
     * Calls to {@link #query} are not affected by this method.
     *
     * @param traversalParameters set of traversal parameters to control which reads get returned by the next call
     *                            to {@link #iterator}
     */
    default void setTraversalBounds(TraversalParameters traversalParameters){
        setTraversalBounds(traversalParameters.getIntervalsForTraversal(), traversalParameters.traverseUnmappedReads());
    }

    /**
     * @return true if traversals initiated via {@link #iterator} will be restricted to reads that overlap intervals
     *         as configured via {@link #setTraversalBounds}, otherwise false
     */
    boolean traversalIsBounded();

    /**
     * @return true if this datasource supports the query() operation otherwise false.
     */
    boolean isQueryableByInterval();

    /**
     * @return An iterator over just the unmapped reads with no assigned position. This operation is not affected
     *         by prior calls to {@link #setTraversalBounds}. The underlying file must be indexed.
     */
    Iterator<GATKRead> queryUnmapped();

    /**
     * Returns the SAM header for this data source.
     *
     * @return SAM header for this data source
     */
    SAMFileHeader getHeader();

    /**
     * Get the sequence dictionary for this ReadsDataSource
     *
     * @return SAMSequenceDictionary for the reads backing this datasource.
     */
    default SAMSequenceDictionary getSequenceDictionary(){
        return getHeader().getSequenceDictionary();
    }

    /**
     * @return true if this {@code ReadsDataSource} supports multiple iterations over the data
     */
    boolean supportsSerialIteration();

    /**
     * Shut down this data source permanently, closing all iterations and readers.
     */
    @Override  //Overriden here to disallow throwing checked exceptions.
    void close();
}

GATKDataSource源码

复制代码

package org.broadinstitute.hellbender.engine;

import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Iterator;

/**
 * A GATKDataSource is something that can be iterated over from start to finish
 * and/or queried by genomic interval. It is not necessarily file-based.
 *
 * @param <T> Type of data in the data source
 */
public interface GATKDataSource<T> extends Iterable<T> {
    Iterator<T> query(final SimpleInterval interval);
}