Source code for diceware.wordlist

#  diceware -- passphrases to remember
#  Copyright (C) 2015, 2016  Uli Fouquet and contributors.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""wordlist.py -- special handling of wordlists.
"""
import os
import re
import tempfile

#: Maximum in-memory file size in bytes (20 MB).
#:
#: This value is used when creating temporary files replacing
#: unseekable input streams. If an input file is larger, we write to
#: disk.
MAX_IN_MEM_SIZE = 20 * 1024 * 1024

#: The directory in which wordlists are stored
WORDLISTS_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), 'wordlists'))

# NOTE: all patterns below are raw strings. Sequences like ``\w`` or
# ``\.`` are not valid escapes in ordinary string literals and trigger
# warnings in recent Python versions; raw strings yield the very same
# pattern text without them.

#: A regular expression matching allowed wordlist names. We
#: allow names that cannot easily mess up filesystems.
RE_WORDLIST_NAME = re.compile(r'^[\w-]+$')

#: A regular expression matching numbered entries in wordlists.
#:
#: The second group contains the term following the number(s).
RE_NUMBERED_WORDLIST_ENTRY = re.compile(r'^[0-9]+(\-[0-9]+)*\s+([^\s]+)$')

#: A regular expression describing valid wordlist file names.
#:
#: The first group contains the name of the wordlist.
RE_VALID_WORDLIST_FILENAME = re.compile(
    r'^wordlist_([\w-]+)\.[\w][\w\.]+[\w]+$')


def get_wordlist_names():
    """Get all names of wordlists stored locally.

    Scans `WORDLISTS_DIR` for regular files whose file name matches
    `RE_VALID_WORDLIST_FILENAME` and collects the name part captured
    by that expression.

    Returns a sorted list of wordlist names.
    """
    names = []
    for filename in os.listdir(WORDLISTS_DIR):
        # Check the name first: it is cheaper than asking the filesystem.
        match = RE_VALID_WORDLIST_FILENAME.match(filename)
        if match and os.path.isfile(os.path.join(WORDLISTS_DIR, filename)):
            names.append(match.group(1))
    return sorted(names)


def get_wordlist_path(name):
    """Get path to a wordlist file for a wordlist named `name`.

    The `name` string must not contain special chars beside ``-``,
    ``_``, regular chars ``A-Z`` (upper or lower case) or
    numbers. Invalid names raise a ValueError.

    If a path with the given name (names are not filenames here) does
    not exist, `None` is returned.
    """
    if not RE_WORDLIST_NAME.match(name):
        raise ValueError("Not a valid wordlist name: %s" % name)
    for filename in os.listdir(WORDLISTS_DIR):
        match = RE_VALID_WORDLIST_FILENAME.match(filename)
        if not match or match.group(1) != name:
            continue
        path = os.path.join(WORDLISTS_DIR, filename)
        # Directories or other non-regular entries never count as wordlists.
        if os.path.isfile(path):
            return path
    return None


class WordList(object):
    """A word list contains words for building passphrases.

    `path_or_filelike` is the path of the wordlist file or an already
    opened file. Opened files must be open for reading, of course. We
    expect filelike objects to support at least `read()`.

    If a file-like object does not support `seek()` (like `sys.stdin`,
    or an object providing no `seek` at all), we create a temporary,
    seekable copy of the input stream. The copy is written to disk
    only, if it is larger than `MAX_IN_MEM_SIZE`. Otherwise the
    wordlist is kept in memory.

    Please note that file objects passed in are never closed by a
    `WordList`. Files a `WordList` opened or created itself (a file
    opened from a path, a temporary copy) can be released by calling
    `close()` or by using the instance in a ``with`` statement.

    Wordlist files are expected to contain words, one word per
    line. Empty lines are ignored, also whitespaces before or trailing
    a line are stripped. If a "word" contains inner whitespaces, then
    these are preserved.

    The input file can be a signed wordlist. Signed wordlists are
    expected to be ordinary lists of words but with ASCII armored
    signatures (as described in RFC 4880).

    In case of signed wordlists the signature headers/footers are
    stripped and the contained list of words is read.

    WordLists are iterables. That means, that you can retrieve the
    words of a wordlist by iterating over an instance of `WordList`.
    Each iteration starts at the beginning of the file again.

    """
    def __init__(self, path_or_filelike=None):
        self.path = None
        # Tells `close()` whether `self.fd` was created by us.
        self._owns_fd = True
        if not hasattr(path_or_filelike, 'read'):
            # got a path, not a filelike object
            self.path = path_or_filelike
            self.fd = open(self.path, "r")
        else:
            self.fd = path_or_filelike
            self._owns_fd = False
            try:
                self.fd.seek(0)
            except (AttributeError, IOError):
                # The given filelike does not support seek() or does
                # not provide it at all. Create an own, seekable copy.
                self.fd = tempfile.SpooledTemporaryFile(
                    max_size=MAX_IN_MEM_SIZE, mode="w+")
                self._owns_fd = True
                self.fd.write(path_or_filelike.read())
                self.fd.seek(0)
        self.signed = self.is_signed()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __iter__(self):
        self.fd.seek(0)
        if self.signed:
            while self.fd.readline().strip():
                # wait for first empty line
                pass
        for line in self.fd:
            # Look for the end marker in the raw line: a dash-escaped
            # line ("- -----BEGIN ...") is regular content and must not
            # terminate the list.
            if self.signed and line.strip() == '-----BEGIN PGP SIGNATURE-----':
                break
            entry = self.refine_entry(line)
            if entry:
                yield entry

    def close(self):
        """Close the underlying file, if it was opened by this wordlist.

        File objects passed in on construction are left untouched. It
        is safe to call this method more than once.
        """
        if getattr(self, '_owns_fd', False):
            self.fd.close()

    def is_signed(self):
        """check, whether this file is cryptographically signed.

        A file is considered signed, if its first line (trailing
        whitespace ignored) is the PGP signed message header.

        This operation resets the file descriptor to the beginning of
        file.
        """
        self.fd.seek(0)
        first_line = self.fd.readline()
        self.fd.seek(0)
        return first_line.rstrip() == "-----BEGIN PGP SIGNED MESSAGE-----"

    def refine_entry(self, entry):
        """Apply modifications to form a proper wordlist entry.

        Refining means: strip() `entry` remove escape-dashes (if this is
        a signed wordlist) and extract the term if it is preceded by
        numbers.

        Returns the refined entry, which can be an empty string.
        """
        if self.signed and entry.startswith('- '):
            entry = entry[2:]
        entry = entry.strip()
        match = RE_NUMBERED_WORDLIST_ENTRY.match(entry)
        if match:
            entry = match.groups()[1]
        return entry