from __future__ import division # py3 "true division"

import os

import random

import itertools

from gensim import utils, matutils

import logging

# Module-level logger named after this module; PathLineSentences reports its progress through it.
logger = logging.getLogger(__name__)

class PathLineSentences:
    """Stream shuffled token chunks from one file or from every file in a directory.

    Modelled on :class:`~gensim.models.word2vec.LineSentence`, but processes all files
    of a directory in alphabetical order by filename.

    Every input line is expected to hold tab-separated fields; the second field
    (index 1) is a comma-separated list of tokens. Each line's tokens are shuffled
    ``SHUFFLE_ROUNDS`` times and, after every shuffle, emitted in consecutive chunks
    of at most `max_sentence_length` tokens.

    Files are opened through gensim's ``utils.open`` in binary mode and decoded with
    ``utils.to_unicode``.
    NOTE(review): compressed inputs (.bz2, .gz) are presumably handled by ``utils.open``
    as they are for ``LineSentence`` - confirm against the installed gensim version.

    Warnings
    --------
    Does **not recurse** into subdirectories. Every directory entry is treated as an
    input file, so the directory should contain readable files only.

    """

    # How many times each input line is reshuffled and re-emitted.
    SHUFFLE_ROUNDS = 25

    def __init__(self, source, max_sentence_length=500, limit=None):
        """Collect the list of input files.

        Parameters
        ----------
        source : str
            Path to a directory of input files, or to a single input file.
        max_sentence_length : int
            Maximum number of tokens per yielded chunk.
        limit : int or None
            Read only the first `limit` lines from each file. Read all if limit is None (the default).

        Raises
        ------
        ValueError
            If `source` is neither an existing file nor an existing directory.

        """
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

        if os.path.isfile(self.source):
            logger.debug('single file given as source, rather than a directory of files')
            logger.debug('consider using models.word2vec.LineSentence for a single file')
            self.input_files = [self.source]  # force code compatibility with list of files
        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            logger.info('reading directory %s', self.source)
            # Full paths, sorted so that files are always processed in filename order.
            self.input_files = sorted(
                os.path.join(self.source, filename) for filename in os.listdir(self.source)
            )
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('input is neither a file nor a path')

        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

    def __iter__(self):
        """Iterate through the files, yielding lists of at most `max_sentence_length` tokens.

        Lines that have no second tab-separated field (for example blank lines) are
        skipped with a warning instead of aborting the iteration with an IndexError.
        """
        chunk_size = self.max_sentence_length
        for file_name in self.input_files:
            logger.info('reading file %s', file_name)
            with utils.open(file_name, 'rb') as fin:
                for raw_line in itertools.islice(fin, self.limit):
                    fields = utils.to_unicode(raw_line).split('\t')
                    if len(fields) < 2:
                        logger.warning('skipping line without a tab-separated token field in %s', file_name)
                        continue
                    tokens = fields[1].strip().split(',')
                    # NOTE(review): the original listing had lost its indentation. Chunks are
                    # emitted after every shuffle, the only reading under which repeating the
                    # shuffle has any effect - confirm this is the intended augmentation.
                    for _ in range(self.SHUFFLE_ROUNDS):
                        random.shuffle(tokens)
                        # e.g. chunk_size = 10 -> tokens[0:10], tokens[10:20], ...
                        for start in range(0, len(tokens), chunk_size):
                            yield tokens[start:start + chunk_size]

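# Source article: "word2vec實現域名向量化並計算相似度-python程式碼"
# (word2vec domain-name vectorization and similarity computation - Python code)

# 相關文章 (Related articles)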