word2vec實現域名向量化並計算相似度-python程式碼

專注的阿熊發表於2021-09-02

from __future__ import division  # py3 "true division"

import os

import random

import itertools

from gensim import utils, matutils

import logging

logger = logging.getLogger(__name__)

class PathLineSentences:
    """Stream shuffled token lists from tab-separated text files.

    Modelled on :class:`gensim.models.word2vec.PathLineSentences`: the source
    is either a single file or a directory whose files are processed in
    alphabetical order by filename. Files are opened with ``gensim.utils.open``
    in binary mode and decoded with ``gensim.utils.to_unicode``.

    Unlike the gensim class, each input line is expected to look like::

        <first column>\\t<token>,<token>,...[\\t<ignored columns>]

    Only the second tab-separated column is used. Its comma-separated tokens
    are shuffled and then emitted in chunks of at most ``max_sentence_length``
    tokens.

    Warnings
    --------
    Does **not recurse** into subdirectories; sub-directories found inside
    `source` are ignored.

    """

    def __init__(self, source, max_sentence_length=500, limit=None):
        """Collect the list of input files.

        Parameters
        ----------
        source : str
            Path to a single file or to a directory of files.
        max_sentence_length : int
            Maximum number of tokens per yielded sentence. Must be >= 1.
        limit : int or None
            Read only the first `limit` lines from each file.
            Read all if limit is None (the default).

        Raises
        ------
        ValueError
            If `max_sentence_length` is smaller than 1, or if `source` is
            neither an existing file nor an existing directory.

        """
        # A non-positive chunk size would make the chunking in __iter__ never
        # advance, so reject it before doing any other work.
        if max_sentence_length < 1:
            raise ValueError('max_sentence_length must be a positive integer')

        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

        if os.path.isfile(self.source):
            logger.debug('single file given as source, rather than a directory of files')
            logger.debug('consider using models.word2vec.LineSentence for a single file')
            self.input_files = [self.source]  # force code compatibility with list of files
        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            logger.info('reading directory %s', self.source)
            candidates = (self.source + filename for filename in os.listdir(self.source))
            # Keep regular files only (a sub-directory cannot be opened as a
            # text file) and sort so files are processed in filename order.
            self.input_files = sorted(path for path in candidates if os.path.isfile(path))
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('input is neither a file nor a path')

        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))

    def __iter__(self):
        """Yield lists of tokens, file by file and line by line.

        For every line the tokens of the second tab-separated column are
        shuffled in place with :func:`random.shuffle` and yielded in
        consecutive chunks of at most ``max_sentence_length`` tokens, e.g. with
        ``max_sentence_length = 10``: ``[t1..t10]``, ``[t11..t20]``, ...

        Lines without a second column are skipped with a warning instead of
        aborting the iteration; empty tokens are dropped.
        """
        chunk_size = self.max_sentence_length
        for file_name in self.input_files:
            logger.info('reading file %s', file_name)
            with utils.open(file_name, 'rb') as fin:
                for line_no, raw_line in enumerate(itertools.islice(fin, self.limit), start=1):
                    columns = utils.to_unicode(raw_line).split('\t')
                    if len(columns) < 2:
                        logger.warning(
                            'skipping line %d of %s: no tab-separated second column',
                            line_no, file_name,
                        )
                        continue

                    tokens = [token for token in columns[1].strip().split(',') if token]
                    if not tokens:
                        continue

                    # A single shuffle already produces a uniformly random
                    # permutation; repeating it adds nothing.
                    random.shuffle(tokens)

                    for start in range(0, len(tokens), chunk_size):
                        yield tokens[start:start + chunk_size]


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2790151/,如需轉載,請註明出處,否則將追究法律責任。

相關文章