|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
import re |
|
from enum import Enum |
|
|
|
from ..multithreading import Counters |
|
from ..pyclass import autoclass, cast, JPaths |
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Python-side handles to the Anserini Java classes wrapped in this module.
JFileSegment = autoclass('io.anserini.collection.FileSegment')
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
|
|
|
|
|
class JCollections(Enum):
    """
    Enumeration of the Anserini collection classes available to wrap.

    Each member's name is the short collection class name and its value is
    the result of ``autoclass`` for the matching class in the
    ``io.anserini.collection`` Java package.
    """

    AclAnthology = autoclass('io.anserini.collection.AclAnthology')
    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
|
|
|
|
|
class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Iterating over an instance yields one ``FileSegment`` wrapper for each
    element produced by the underlying Java collection's iterator. The
    instance is its own iterator (``__iter__`` returns ``self``), so it
    supports a single pass.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate; looked up by name in
        ``JCollections``
    collection_path : str
        Path to directory containing collection

    Raises
    ------
    ValueError
        If the collection class cannot be looked up or instantiated. The
        underlying error is attached as ``__cause__``.
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
        self.collection_class = collection_class
        self.collection_path = JPaths.get(collection_path)
        self.object = self._get_collection()
        self.collection_iterator = self.object.iterator()

    def _get_collection(self):
        """Instantiate the Java collection named by ``self.collection_class``.

        Returns
        -------
        object
            The ``JCollections`` member's value called with
            ``self.collection_path``.

        Raises
        ------
        ValueError
            Carrying ``self.collection_class`` as its only argument, chained
            to the original lookup or construction error.
        """
        try:
            return JCollections[self.collection_class].value(self.collection_path)
        except Exception as err:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit propagate, and chain the original
            # error so the real cause stays visible in the traceback.
            raise ValueError(self.collection_class) from err

    def __iter__(self):
        """Return ``self``; the collection acts as its own iterator."""
        return self

    def __next__(self):
        """Return the next segment wrapped in ``FileSegment``.

        Raises
        ------
        StopIteration
            When the underlying iterator reports no further segments.
        """
        if not self.collection_iterator.hasNext():
            raise StopIteration
        fs = self.collection_iterator.next()
        return FileSegment(self, fs, fs.getSegmentPath())
|
|
|
|
|
class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    Iterating over an instance yields one ``SourceDocument`` wrapper per
    element of the underlying segment iterator. When the iterator is
    exhausted, error and skip statistics are recorded on the parent
    collection's counters and the underlying segment is closed.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : JFileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment. It is passed to
        ``collection.collection_path.relativize`` to derive
        ``segment_name``.
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            # First try casting to "<collection class name>$Segment", i.e. a
            # Segment class nested inside the concrete collection class.
            self.object = cast(collection.object.getClass().getName() + '$Segment', segment)
        except Exception:
            # Fall back to the generic FileSegment type. Exception (not a
            # bare ``except``) keeps KeyboardInterrupt/SystemExit propagating.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)

        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        # Path relative to the collection root, with both kinds of path
        # separator replaced by '-'.
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())

    def __iter__(self):
        """Return ``self``; the segment acts as its own iterator."""
        return self

    def __next__(self):
        """Return the next document wrapped in ``SourceDocument``.

        Raises
        ------
        StopIteration
            When the segment iterator has no further documents. Before
            raising, errors and skipped-document counts are logged and added
            to the parent collection's counters, and the segment is closed.
        """
        # Use the iterator created in __init__ instead of requesting a new
        # one from the segment on every call.
        if self.segment_iterator.hasNext():
            return SourceDocument(self, self.segment_iterator.next())

        # Log if iteration was stopped by an error.
        if self.object.getErrorStatus():
            logger.error(self.segment_name + ': Error from segment iteration, stopping...')
            self.collection.counters.errors.increment()

        # Record skipped documents, then release the segment.
        skipped = self.object.getSkippedCount()
        if skipped > 0:
            self.collection.counters.skips.increment(skipped)
            logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped')
        self.object.close()
        raise StopIteration
|
|
|
|
|
class SourceDocument:
    """
    Wrapper class for Anserini's SourceDocument.

    The constructor reads ``id``, ``indexable``, ``contents`` and ``raw``
    from the wrapped object once and stores them as plain attributes.

    Parameters
    ----------
    segment : FileSegment
        Parent segment of the source document
    document : io.anserini.collection.SourceDocument
        SourceDocument object to create wrapper from

    Raises
    ------
    TypeError
        If ``document`` is not an instance of ``JSourceDocument``.
    """

    def __init__(self, segment, document):
        is_source_document = isinstance(document, JSourceDocument)
        if not is_source_document:
            raise TypeError('Invalid JSourceDocument!')

        self.segment = segment
        self.object = document

        # Snapshot the Java-side accessors, in the same order as before.
        self.id = document.id()
        self.indexable = document.indexable()
        self.contents = document.contents()
        self.raw = document.raw()
|
|