Stress testing/Python script: Generating a large number of notes and tags

Background

My personal Joplin notebook only has a few dozen tags, but I want to be able to test the performance issues referenced in discussion posts like this one.

Script for generating a large Joplin notebook

Below is a (very messy) Python script that creates a Joplin export directory with many, many tags.

:warning: Do not import the directory generated by this script unless you have

  1. Backed up all of your Joplin settings and data
  2. Turned off sync.
Show code
#!/bin/python3
import os, uuid, datetime, re, random

##
## A rather messy script that, given a directory of text files,
## creates a large Joplin notebook.
##
## This script is published in the hope that others may find it useful.
## (Consider it licensed under the MIT public license).
##

# Path to a directory with large text files! Edit this before running.
# Every entry of this directory is treated as one book.
# (I fetched several books (as .txt files) from Project Gutenberg
# and stored them in a directory.)
BOOKS_DIR = "/path/to/some/books"

# Output directory: an "out" folder next to this script.
# abspath() is required because __file__ may be a bare relative name (for
# example when started as `python3 joplinify.py` on Python < 3.9); dirname()
# of such a name is "" and the output path would wrongly become "/out".
OUT_DIR = os.path.dirname(os.path.abspath(__file__)) + "/out"
# One path per entry of BOOKS_DIR. This is evaluated when the module is
# loaded, so BOOKS_DIR must exist by then. Entries are not filtered or
# sorted: everything os.listdir() returns is treated as a book path.
BOOK_PATHS = [ BOOKS_DIR + "/" + p for p in os.listdir(BOOKS_DIR) ]
# Matches every maximal run of ASCII letters (one "word").
# The pattern intentionally consumes nothing but the word itself: a pattern
# that also consumes one delimiter on each side makes findall() skip every
# second word of "a b c", because the single space between two words can
# only belong to one match.
WORD_REGEX = re.compile(r'[a-zA-Z]+')

# Probability (between 0 and 1) that a given word occurrence will be
# ignored (when reading a portion of the input data and considering
# whether to track a word). A word is tracked only when random.random()
# exceeds this value, so higher values produce fewer tag links.
CHANCE_OF_IGNORE_WORD = 0.7

def timestamp():
    """
        Returns the current UTC time as an ISO 8601 string with millisecond
        precision and no timezone suffix, e.g. "2024-01-31T12:34:56.789".

        Callers append a literal "Z" (UTC designator) to the result, so the
        value must be UTC; local wall-clock time would be mislabelled.
    """
    utcNow = datetime.datetime.now(datetime.timezone.utc)
    # Drop tzinfo so isoformat() does not emit "+00:00" in front of the
    # caller's "Z". A fixed timespec keeps the fractional part present even
    # when the sub-second component happens to be zero.
    return utcNow.replace(tzinfo=None).isoformat(timespec='milliseconds')

def addStandardLines(lines, parentId, newlineChar='\n'):
    """
        Appends the metadata fields shared by all generated items to `lines`
        (the list is modified in place; nothing is returned).

        lines:       list of strings to extend.
        parentId:    value for the `parent_id` field; pass None to leave the
                     field out entirely (an empty string still writes it).
        newlineChar: suffix added to every appended field. Pass '' when the
                     caller joins the lines with newlines itself.
    """
    stamp = timestamp()
    timeKeys = (
        "created_time",
        "updated_time",
        "user_created_time",
        "user_updated_time",
    )
    # All four time fields share the same timestamp, suffixed with "Z".
    fields = ["{}: {}Z".format(key, stamp) for key in timeKeys]
    fields += ["encryption_cipher_text: ", "encryption_applied: 0"]

    if parentId is not None:
        fields.append("parent_id: {}".format(parentId))

    fields += ["is_shared: 0", "share_id: ", "master_key_id: "]
    lines.extend([field + newlineChar for field in fields])

def getUUID():
    """
        Returns a newly generated random (version 4) UUID as a string of
        32 lowercase hexadecimal characters, without dashes.
    """
    return uuid.uuid4().hex

def mdPath(uuid):
    """
        Returns the path of the `.md` file inside OUT_DIR that belongs to
        the item with the given id.
    """
    # NOTE: the parameter shadows the `uuid` module within this function;
    # the name is kept because it is part of the function's signature.
    fileName = "{}.md".format(uuid)
    return OUT_DIR + "/" + fileName

# Process book
def _writeNotebook(notebookUUID, title):
    """
        Writes the notebook spec file (type_ 2) with the given id and title.
    """
    with open(mdPath(notebookUUID), 'w', encoding='utf-8') as f:
        lines = [
            title + "\n",
            "\n",
            "id: {}\n".format(notebookUUID),
        ]
        # Empty parent ID; every line carries its own trailing newline here.
        addStandardLines(lines, "")
        lines.append("icon: \n")
        lines.append("type_: 2")
        f.writelines(lines)


def _readChunks(path):
    """
        Reads the text file at `path` and returns a list of chunks, one per
        paragraph. Each chunk starts with a generated heading line followed
        by the paragraph's lines joined with single spaces.
    """
    chunks = ["First paragraph.\n\n"]
    paragraphNo = 1
    previousWasBlank = False

    # Read as UTF-8 regardless of the platform default. "utf-8-sig" also
    # drops a leading byte-order mark when one is present, and undecodable
    # bytes are replaced instead of aborting the whole run.
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
        for line in f:
            isBlank = line.strip() == ""
            if isBlank and not previousWasBlank:
                # The first blank line after text starts a new paragraph.
                paragraphNo += 1
                chunks.append("Paragraph " + str(paragraphNo) + "\n\n")
            else:
                # Text lines, and any further consecutive blank lines, are
                # appended to the current chunk.
                chunks[-1] += line.rstrip() + " "
            previousWasBlank = isBlank

    return chunks


def _writeNote(noteUUID, notebookUUID, chunk):
    """
        Writes a single note (type_ 1) containing `chunk` into the notebook
        identified by `notebookUUID`.
    """
    with open(mdPath(noteUUID), 'w', encoding='utf-8') as f:
        lines = [chunk]
        lines.append("")
        lines.append("")
        lines.append("Which has length, {}".format(len(chunk)))
        lines.append("")
        lines.append("")
        lines.append("id: {}".format(noteUUID))
        # No newline suffix: the lines are joined with newlines below.
        addStandardLines(lines, notebookUUID, '')
        lines.extend([
            "latitude: 47.6",
            "longitude: -122",
            "altitude: 0.000",
        ])
        lines.append("markup_language: 1")
        lines.append("is_todo: 0")
        lines.append("todo_due: 0")
        lines.append("todo_completed: 0")
        lines.append("source_url: ")
        lines.append("author: ")
        lines.append("is_conflict: 0")
        lines.append("conflict_original_id: ")
        lines.append("source: joplin-desktop")
        lines.append("source_application: net.cozic.joplin-desktop")
        lines.append("application_data: ")
        lines.append("order: 0")
        lines.append("type_: 1")
        f.write("\n".join(lines))


def processBook(path):
    """
        Converts the text file at `path` into one notebook file plus one
        note file per paragraph, all written to OUT_DIR.

        Returns a dict mapping lowercased words to the list of ids of the
        notes they were sampled from. Each word occurrence is kept only when
        random.random() exceeds CHANCE_OF_IGNORE_WORD.
    """
    notebookUUID = getUUID()
    print("Processing", path)

    # Write the notebook spec file.
    _writeNotebook(notebookUUID, os.path.basename(path))

    # Write chunks of the book as .md files.
    words = {}
    for chunk in _readChunks(path):
        noteUUID = getUUID()
        # NOTE(review): a word that is sampled several times within one
        # chunk records the same note several times, which later produces
        # duplicate tag links -- confirm whether that is intended.
        for word in re.findall(WORD_REGEX, chunk):
            if random.random() > CHANCE_OF_IGNORE_WORD:
                words.setdefault(word.lower(), []).append(noteUUID)

        print("Writing", mdPath(noteUUID))
        _writeNote(noteUUID, notebookUUID, chunk)

    return words


def mergeTagGroups(tagsListGroups):
    """
        Merge a list of tag maps into a single tag map.
        This stage merges the results of several possibly-parallel
        tasks.

        tagsListGroups: iterable of dicts mapping a tag name to a list of
                        note ids.
        Returns a new dict mapping each tag name to the concatenation of
        its lists, in group order. The input dicts and their lists are
        left unmodified.
    """
    tags = {}
    for tagGroup in tagsListGroups:
        for tag, noteUUIDs in tagGroup.items():
            # setdefault() gives every tag a fresh list. Storing the group's
            # own list instead would let a later extend() mutate the input.
            tags.setdefault(tag, []).extend(noteUUIDs)
    return tags

def writeTags(tags):
    """
        Writes one tag file per key of `tags`, followed by one link file
        for every note id listed under that key.

        tags: dict mapping a tag name to a list of note ids.
    """
    for tagName, taggedNotes in tags.items():
        tagId = getUUID()
        writeTag(tagName, tagId)

        for noteId in taggedNotes:
            writeLink(tagId, noteId)

def writeTag(tag, tagUUID):
    """
        Writes the file for one tag (type_ 5) with the given name and id.
        Links between the tag and notes are not written here.
    """
    with open(mdPath(tagUUID), 'w') as out:
        fields = [tag, "", "id: {}".format(tagUUID)]
        # Empty parent ID and no per-line suffix, because the fields are
        # joined with newlines when written.
        addStandardLines(fields, "", "")
        fields.append("type_: 5")

        content = "\n".join(fields)
        out.write(content)

def writeLink(tagUUID, noteUUID):
    """
        Writes the file (type_ 6) that associates the note `noteUUID` with
        the tag `tagUUID`. The link gets a fresh id of its own.
    """
    linkId = getUUID()
    with open(mdPath(linkId), 'w') as out:
        fields = [
            "{}: {}".format(key, value)
            for key, value in (
                ("id", linkId),
                ("note_id", noteUUID),
                ("tag_id", tagUUID),
            )
        ]
        # None omits the parent_id field; '' because the fields are joined
        # with newlines below.
        addStandardLines(fields, None, '')
        fields.append("type_: 6")

        content = "\n".join(fields)
        out.write(content)

def setupOut():
    """
        Creates the `resources` directory inside OUT_DIR, creating OUT_DIR
        itself first when it does not exist yet.

        Raises FileExistsError when the resources directory is already
        present, which typically means output of a previous run is still
        in OUT_DIR.
    """
    print("Setting up", OUT_DIR)
    # makedirs() also creates a missing OUT_DIR. exist_ok is deliberately
    # left at its default so a leftover resources directory still stops
    # the run.
    os.makedirs(OUT_DIR + "/resources")

if __name__ == "__main__":
    setupOut()

    # Process the books one after another, collecting one word -> note ids
    # map per book, then fold the maps into a single tag map.
    tagGroups = []
    for bookPath in BOOK_PATHS:
        tagGroups.append(processBook(bookPath))
    tags = mergeTagGroups(tagGroups)

    print("Writing tags and links.")
    writeTags(tags)
    print("Done.")

Using it

  1. Download several large text files and store them in a directory. I downloaded books from Project Gutenberg for this. (These books should be public domain in the USA — I don't know about other regions).

  2. Replace BOOKS_DIR="/path/to/some/books" with the path to the directory containing the downloaded books.

  3. Create an out/ folder in the same directory as the script

  4. Run the script!

    • The out/ folder needs to be empty at the beginning of each run of the script.
    • I use a shell script for this:
      •  #!/bin/sh
         rm -rf out/
         mkdir out
         python3 joplinify.py
        
  5. Import the out/ directory into Joplin (when importing, select the Joplin export directory format)

    • Depending on the size and number of books, this can take a long time.

Note

When I ran this, it took roughly 3 hours for the import to finish. Notes are somewhat short and there are roughly 11,000 tags.

10 Likes

The CLI app has a "testing" command that can automatically create a number of notes, tags and notebooks too, although I think it only creates empty notes. The advantage of running it from the app is that you bypass the API, so it's probably faster.

1 Like