from __future__ import division # py3 "true division"
import os
import random
import itertools
from gensim import utils, matutils
import logging
logger = logging.getLogger(__name__)
class PathLineSentences:
    """Stream token lists from a single file or from every file in a directory.

    Modelled on :class:`~gensim.models.word2vec.LineSentence`, but processes
    all files of a directory in alphabetical order by filename.

    Each input line is expected to look like ``<field>\\t<tok1>,<tok2>,...``:
    the second tab-separated field is taken and split on commas to obtain the
    tokens. Lines without a tab are skipped with a warning.
    """

    # Number of times the chunks of every line are emitted.
    # NOTE(review): the pasted source had lost its indentation; it is assumed
    # here that the chunking loop was nested inside ``for i in range(25)``,
    # i.e. that each line is deliberately repeated 25 times. Confirm with the
    # author -- if the repeat was dead code, set this to 1.
    _PASSES_PER_LINE = 25

    def __init__(self, source, max_sentence_length=500, limit=None):
        """Collect the list of input files to iterate over.

        The directory must only contain files that can be read by
        :class:`gensim.models.word2vec.LineSentence`: .bz2, .gz, and text
        files. Any file not ending with .bz2 or .gz is assumed to be a text
        file. Does **not recurse** into subdirectories.

        Parameters
        ----------
        source : str
            Path to the directory (a path to a single file is also accepted).
        max_sentence_length : int
            Maximum number of tokens per yielded sentence; longer lines are
            split into consecutive chunks of at most this many tokens.
        limit : int or None
            Read only the first `limit` lines from each file. Read all if
            limit is None (the default).

        Raises
        ------
        ValueError
            If `source` is neither a file nor a directory, or if
            `max_sentence_length` is smaller than 1.

        """
        # A non-positive chunk size would make __iter__ loop forever.
        if max_sentence_length < 1:
            raise ValueError('max_sentence_length must be a positive integer')

        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

        if os.path.isfile(self.source):
            logger.debug('single file given as source, rather than a directory of files')
            logger.debug('consider using models.word2vec.LineSentence for a single file')
            self.input_files = [self.source]  # force code compatibility with list of files
        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            logger.info('reading directory %s', self.source)
            # Full paths, sorted so that files are processed in filename order.
            self.input_files = sorted(
                os.path.join(self.source, filename) for filename in os.listdir(self.source)
            )
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('input is neither a file nor a path')
        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

    def __iter__(self):
        """Iterate through the files, yielding lists of tokens.

        Every usable line is decoded, its second tab-separated field is split
        on commas, and the resulting tokens are yielded in chunks of at most
        ``max_sentence_length`` tokens. The chunks of each line are emitted
        ``_PASSES_PER_LINE`` times in a row.

        Yields
        ------
        list of str
            One chunk of tokens.

        """
        chunk_size = self.max_sentence_length
        for file_name in self.input_files:
            logger.info('reading file %s', file_name)
            with utils.open(file_name, 'rb') as fin:
                numbered_lines = enumerate(itertools.islice(fin, self.limit), start=1)
                for line_no, raw_line in numbered_lines:
                    fields = utils.to_unicode(raw_line).split('\t')
                    if len(fields) < 2:
                        # e.g. a blank line: there is no token field to read,
                        # so skip it instead of failing with IndexError.
                        logger.warning(
                            'skipping line %d of %s: no tab-separated token field',
                            line_no, file_name,
                        )
                        continue
                    tokens = fields[1].strip().split(',')
                    for _ in range(self._PASSES_PER_LINE):
                        # if max_sentence_length = 10: [t0..t9], [t10..t19], ...
                        for start in range(0, len(tokens), chunk_size):
                            yield tokens[start:start + chunk_size]
# Source: ITPUB blog, http://blog.itpub.net/69946337/viewspace-2790151/ -- please credit the source when reposting; otherwise legal liability will be pursued.
