# Source code for diceware.wordlist

#  diceware -- passphrases to remember
#  Copyright (C) 2015-2017  Uli Fouquet and contributors.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""wordlist.py -- special handling of wordlists.
"""
import os
import re
import sys
import tempfile

#: Maximum in-memory file size in bytes (20 MB).
#:
#: This value is used when creating temporary files replacing
#: unseekable input streams. If an input file is larger, we write to
#: disk.
MAX_IN_MEM_SIZE = 20 * 1024 * 1024

#: A regular expression matching allowed wordlist names. We
#: allow names that cannot easily mess up filesystems.
#:
#: The pattern is anchored with ``\Z`` instead of ``$``: ``$`` also
#: matches right before a trailing newline, which would let a name such
#: as ``"en\n"`` pass validation.
RE_WORDLIST_NAME = re.compile(r'^[\w-]+\Z')

#: A regular expression matching numbered entries in wordlists.
#:
#: An entry consists of a number (optionally followed by further
#: dash-separated numbers), whitespace, and a single term without
#: whitespace. The term is the second capturing group.
RE_NUMBERED_WORDLIST_ENTRY = re.compile(r'^[0-9]+(\-[0-9]+)*\s+([^\s]+)$')

#: A regular expression describing valid wordlist file names.
#:
#: The first capturing group is the wordlist name. Anchored with ``\Z``
#: so that file names ending in a newline character are not accepted.
RE_VALID_WORDLIST_FILENAME = re.compile(
    r'^wordlist_([\w-]+)\.[\w][\w\.]+[\w]+\Z')


def get_wordlists_dir():
    """Get the directory in which word lists are stored.

    Returns the absolute path of the ``wordlists`` directory located
    next to this module. The path is derived from ``__file__``; whether
    the directory actually exists is not checked here.
    """
    return os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'wordlists'))


def get_wordlist_names():
    """Get all names of wordlists stored locally.

    Scans the wordlists directory for regular files whose names match
    `RE_VALID_WORDLIST_FILENAME` and returns the name part of each file
    name as a sorted list.
    """
    wordlists_dir = get_wordlists_dir()
    names = []
    for filename in os.listdir(wordlists_dir):
        # Check the name first: it is cheaper than asking the filesystem.
        match = RE_VALID_WORDLIST_FILENAME.match(filename)
        if not match:
            continue
        if not os.path.isfile(os.path.join(wordlists_dir, filename)):
            continue
        names.append(match.groups()[0])
    return sorted(names)


def get_wordlist_path(name):
    """Get path to a wordlist file for a wordlist named `name`.

    The `name` string must match `RE_WORDLIST_NAME`, i.e. it must not
    contain special chars beside ``-``, ``_``, regular chars ``A-Z``
    (upper or lower case) or numbers. Invalid names raise a ValueError.

    If a path with the given name (names are not filenames here) does
    not exist, `None` is returned. If several files carry the same
    wordlist name, the first one in sorted filename order is returned.
    """
    if not RE_WORDLIST_NAME.match(name):
        raise ValueError("Not a valid wordlist name: %s" % name)
    wordlists_dir = get_wordlists_dir()
    # os.listdir() returns entries in arbitrary order; sort to make the
    # result deterministic when more than one file matches `name`.
    for filename in sorted(os.listdir(wordlists_dir)):
        match = RE_VALID_WORDLIST_FILENAME.match(filename)
        if not match or match.groups()[0] != name:
            continue
        candidate = os.path.join(wordlists_dir, filename)
        if os.path.isfile(candidate):
            return candidate
    return None


class WordList(object):
    """A word list contains words for building passphrases.

    `path` is the path of the wordlist file. With single dash (``-``) as path,
    we read from `sys.stdin`.

    In case input comes from stdin, we write the input stream into a file if
    the content length is larger than `MAX_IN_MEM_SIZE`. Otherwise, the
    wordlist is kept in memory.

    Wordlist files are expected to contain words, one word per line. Empty
    lines are ignored, also whitespaces before or trailing a line are
    stripped. If a "word" contains inner whitespaces, then these are
    preserved.

    The input file can be a signed wordlist. Signed wordlists are expected to
    be ordinary lists of words but with ASCII armored signatures (as described
    in RFC 4880).

    In case of signed wordlists the signature headers/footers are stripped and
    the contained list of words is read.

    WordList instances are iterable. That means, that you can retrieve the
    words of a wordlist by iterating over an instance of `WordList`. Each
    iteration rewinds the underlying file and starts from the first word.

    """
    def __init__(self, path):
        self.path = path
        self.fd = None
        if self.path == "-":
            # Buffer stdin in a temporary file so that we can seek in it.
            # The data is kept in memory up to MAX_IN_MEM_SIZE bytes.
            self.fd = tempfile.SpooledTemporaryFile(
                    max_size=MAX_IN_MEM_SIZE, mode="w+")
            self.fd.write(sys.stdin.read())
            self.fd.seek(0)
        else:
            self.fd = open(self.path, "r")
        self.signed = self.is_signed()

    def __del__(self):
        # `fd` can be missing or `None` if `__init__` did not complete.
        # The temporary file holding stdin data is closed as well.
        fd = getattr(self, "fd", None)
        if fd is not None:
            fd.close()

    def __iter__(self):
        """Yield the refined, non-empty entries of the wordlist.

        For signed wordlists the header block (everything up to the first
        empty line) is skipped and reading stops at the signature block.
        """
        signature_start = '-----BEGIN PGP SIGNATURE-----'
        self.fd.seek(0)
        if self.signed:
            while self.fd.readline().strip():
                # wait for first empty line
                pass
        for line in self.fd:
            # Look at the raw line: the real signature marker is never
            # dash-escaped, while a word that merely looks like the marker
            # is (``- -----BEGIN ...``) and must not end the wordlist.
            if self.signed and line.rstrip() == signature_start:
                break
            entry = self.refine_entry(line)
            if entry:
                yield entry

    def is_signed(self):
        """check, whether this file is cryptographically signed.

        Reads the first line of the file and compares it with the
        header of a PGP signed message. The file position is reset to
        the beginning of file afterwards.
        """
        self.fd.seek(0)
        line1 = self.fd.readline()
        self.fd.seek(0)
        return line1.rstrip() == "-----BEGIN PGP SIGNED MESSAGE-----"

    def refine_entry(self, entry):
        """Apply modifications to form a proper wordlist entry.

        Refining means: strip() `entry` remove escape-dashes (if this is
        a signed wordlist) and extract the term if it is preceded by
        numbers.

        Returns the refined entry, which may be an empty string.
        """
        if self.signed and entry.startswith('- '):
            # dash-escaped line of a signed message: drop the escape
            entry = entry[2:]
        entry = entry.strip()
        match = RE_NUMBERED_WORDLIST_ENTRY.match(entry)
        if match:
            entry = match.groups()[1]
        return entry