Background
My personal Joplin notebook only has a few dozen tags, but I want to be able to test the performance issues referenced in discussion posts like this one.
Script for generating a large Joplin notebook
Below is a (very messy) Python
script that creates a Joplin export directory with many, many tags.
Do not import the directory generated by this script unless you have:
- Backed up all of your Joplin settings and data
- Turned off sync.
Show code
#!/bin/python3
import os, uuid, datetime, re, random
##
## A rather messy script that, given a directory of text files,
## creates a large Joplin notebook.
##
## This script is published in the hope that others may find it useful.
## (Consider it licensed under the MIT public license).
##
# Path to a directory with large text files!
# I fetched several books (as .txt files) from Project Gutenberg
# and stored them in a directory
BOOKS_DIR = "/path/to/some/books"

# Output directory: an "out" folder next to this script. The script path is
# made absolute first because, on Python versions before 3.9, `__file__` can
# be a bare relative filename whose dirname is "" (which would yield "/out").
OUT_DIR = os.path.dirname(os.path.abspath(__file__)) + "/out"

# Every regular file directly inside BOOKS_DIR is treated as a book.
# Sub-directories are skipped because they cannot be opened as text files.
BOOK_PATHS = [
    BOOKS_DIR + "/" + p
    for p in os.listdir(BOOKS_DIR)
    if os.path.isfile(BOOKS_DIR + "/" + p)
]

# Matches each maximal run of ASCII letters (one "word").
# The pattern deliberately does not consume the characters around a word:
# consuming the trailing delimiter makes findall() skip every second word
# in text where words are separated by a single character.
WORD_REGEX = re.compile(r'[a-zA-Z]+')

# Probability that a given word will be ignored (when
# reading a portion of the input data and considering whether to
# track a word).
CHANCE_OF_IGNORE_WORD = 0.7
def timestamp():
    """
    Returns the current UTC time as an ISO 8601 string with millisecond
    precision and no timezone suffix, e.g. "2023-01-31T12:34:56.789".

    The value is naive UTC (not local time) because callers append a
    literal "Z" to it, which marks the time as UTC. The precision is fixed
    so the string always has the same shape, even when the microsecond
    component happens to be zero.
    """
    utcNow = datetime.datetime.now(datetime.timezone.utc)
    # Drop the tzinfo so isoformat() does not emit "+00:00" before the "Z"
    # that callers add themselves.
    return utcNow.replace(tzinfo=None).isoformat(timespec='milliseconds')
def addStandardLines(lines, parentId, newlineChar='\n'):
    """
    Appends the metadata fields shared by all generated items to `lines`.

    `lines` is modified in place; nothing is returned. Each appended field
    has `newlineChar` added to its end (pass '' when the caller joins the
    lines itself). The "parent_id" field is only emitted when `parentId`
    is not None; an empty string still produces an (empty) parent_id field.
    """
    stamp = timestamp()

    # All four time fields share the same timestamp.
    timeTemplates = (
        "created_time: {}Z",
        "updated_time: {}Z",
        "user_created_time: {}Z",
        "user_updated_time: {}Z",
    )
    fields = [template.format(stamp) for template in timeTemplates]
    fields += [
        "encryption_cipher_text: ",
        "encryption_applied: 0",
    ]

    if parentId is not None:
        fields.append("parent_id: {}".format(parentId))

    fields += [
        "is_shared: 0",
        "share_id: ",
        "master_key_id: ",
    ]

    for field in fields:
        lines.append(field + newlineChar)
def getUUID():
    """
    Returns a fresh random (version 4) UUID as a 32-character lowercase
    hexadecimal string without dashes.
    """
    freshId = uuid.uuid4()
    return freshId.hex
def mdPath(uuid):
    """
    Returns the path of the ".md" file inside OUT_DIR that stores the item
    with the given id.
    """
    fileName = "{}.md".format(uuid)
    return "/".join((OUT_DIR, fileName))
def _writeNotebook(notebookUUID, title):
    """Writes the notebook (type_ 2) spec file for a notebook called `title`."""
    lines = [
        title + "\n",
        "\n",
        "id: {}\n".format(notebookUUID),
    ]
    addStandardLines(lines, "")
    lines.append("icon: \n")
    lines.append("type_: 2")

    # Written as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(mdPath(notebookUUID), 'w', encoding='utf-8') as f:
        f.writelines(lines)


def _readChunks(path):
    """
    Splits the text file at `path` into paragraph-sized chunks.

    Each chunk starts with a heading line ("First paragraph." or
    "Paragraph N") followed by a blank line, then the paragraph's lines
    joined by single spaces. A run of consecutive blank lines starts only
    one new chunk.
    """
    chunks = ["First paragraph.\n\n"]
    paragraphNo = 1
    previousWasBlank = False

    # "utf-8-sig" also accepts files without a byte-order mark. Undecodable
    # bytes are replaced rather than aborting the whole run, since the text
    # only serves as filler content.
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
        for line in f:
            isBlank = line.strip() == ""
            if isBlank and not previousWasBlank:
                paragraphNo += 1
                chunks.append("Paragraph " + str(paragraphNo) + "\n\n")
            else:
                chunks[-1] += line.rstrip() + " "
            previousWasBlank = isBlank

    return chunks


def _writeNote(noteUUID, notebookUUID, chunk):
    """Writes `chunk` as a note (type_ 1) inside the given notebook."""
    lines = [
        chunk,
        "",
        "",
        "Which has length, {}".format(len(chunk)),
        "",
        "",
        "id: {}".format(noteUUID),
    ]
    # No newline suffix: the lines are joined with "\n" below.
    addStandardLines(lines, notebookUUID, '')
    lines.extend([
        "latitude: 47.6",
        "longitude: -122",
        "altitude: 0.000",
        "markup_language: 1",
        "is_todo: 0",
        "todo_due: 0",
        "todo_completed: 0",
        "source_url: ",
        "author: ",
        "is_conflict: 0",
        "conflict_original_id: ",
        "source: joplin-desktop",
        "source_application: net.cozic.joplin-desktop",
        "application_data: ",
        "order: 0",
        "type_: 1",
    ])

    with open(mdPath(noteUUID), 'w', encoding='utf-8') as f:
        f.write("\n".join(lines))


# Process book
def processBook(path):
    """
    Converts the text file at `path` into one notebook containing one note
    per paragraph, all written into the output directory.

    Returns a dict mapping each sampled (lowercased) word to the list of
    ids of the notes it was sampled from. Each word occurrence is kept with
    probability 1 - CHANCE_OF_IGNORE_WORD, and a note id is recorded at
    most once per word, so repeated words in one paragraph do not produce
    duplicate tag-note links.
    """
    notebookUUID = getUUID()
    print("Processing", path)

    # Write the notebook spec file.
    _writeNotebook(notebookUUID, os.path.basename(path))

    # Write chunks of the book as .md files.
    words = {}
    for chunk in _readChunks(path):
        noteUUID = getUUID()

        # Words already recorded for this note.
        recorded = set()
        for word in re.findall(WORD_REGEX, chunk):
            if random.random() > CHANCE_OF_IGNORE_WORD:
                word = word.lower()
                if word not in recorded:
                    recorded.add(word)
                    words.setdefault(word, []).append(noteUUID)

        print("Writing", mdPath(noteUUID))
        _writeNote(noteUUID, notebookUUID, chunk)

    return words
def mergeTagGroups(tagsListGroups):
    """
    Merge a list of tag maps into a single tag map.
    This stage merges the results of several possibly-parallel
    tasks.

    Each tag map associates a tag name with a list of note ids. The result
    maps every tag name to the concatenation of its lists, in the order the
    groups were given. The input maps and their lists are never modified:
    the result owns freshly built lists.
    """
    tags = {}
    for tagGroup in tagsListGroups:
        for tag, noteUUIDs in tagGroup.items():
            # setdefault() creates a new list for a first-seen tag, so the
            # extend() below never touches a list owned by an input map.
            tags.setdefault(tag, []).extend(noteUUIDs)
    return tags
def writeTags(tags):
    """
    Writes one tag file per entry of `tags`, plus one link file for every
    note id listed under that tag.

    `tags` maps a tag name to the list of ids of the notes carrying it.
    """
    for tagName, taggedNotes in tags.items():
        newTagUUID = getUUID()
        writeTag(tagName, newTagUUID)
        for noteId in taggedNotes:
            writeLink(newTagUUID, noteId)
def writeTag(tag, tagUUID):
    """
    Writes the file describing one tag (type_ 5) with the given name and id.
    Links from the tag to notes are not written here.
    """
    fields = [tag, "", "id: {}".format(tagUUID)]
    # Empty parent id; no per-line newline because the fields are joined
    # with "\n" below.
    addStandardLines(fields, "", "")
    fields.append("type_: 5")
    contents = "\n".join(fields)

    with open(mdPath(tagUUID), 'w') as out:
        out.write(contents)
def writeLink(tagUUID, noteUUID):
    """
    Writes the file (type_ 6) that associates the note `noteUUID` with the
    tag `tagUUID`. The link gets its own freshly generated id.
    """
    linkUUID = getUUID()
    fields = [
        "id: {}".format(linkUUID),
        "note_id: {}".format(noteUUID),
        "tag_id: {}".format(tagUUID),
    ]
    # None suppresses the parent_id field; '' because the fields are
    # joined with "\n" below.
    addStandardLines(fields, None, '')
    fields.append("type_: 6")
    contents = "\n".join(fields)

    with open(mdPath(linkUUID), 'w') as out:
        out.write(contents)
def setupOut():
    """
    Creates the "resources" sub-directory of OUT_DIR, creating OUT_DIR
    itself first when it does not exist yet.

    Raises FileExistsError when the "resources" directory already exists,
    i.e. when the output directory still holds a previous run.
    """
    print("Setting up", OUT_DIR)
    # makedirs() (unlike mkdir()) also creates a missing OUT_DIR. exist_ok
    # is left at its default so a leftover "resources" folder still raises.
    os.makedirs(OUT_DIR + "/resources")
if __name__ == "__main__":
    # Prepare the output directory before anything is written.
    setupOut()
    # Process every book in order (one word -> note-ids map per book),
    # then fold the per-book maps into a single tag map.
    tags = mergeTagGroups(list(map(processBook, BOOK_PATHS)))
    print("Writing tags and links.")
    writeTags(tags)
    print("Done.")
Using it
-
Download several large text files and store them in a directory. I downloaded books from Project Gutenberg for this. (These books should be public domain in the USA — I don't know about other regions).
-
Replace
BOOKS_DIR="/path/to/some/books"
with the path to the directory containing the downloaded books. -
Create an
out/
folder in the same directory as the script -
Run the script!
- The
out/
folder needs to be empty at the beginning of each run of the script. - I use a shell script for this:
-
#!/bin/sh
rm -rf out/
mkdir out
python3 joplinify.py
-
- The
-
Import the directory from Joplin (select
Joplin Export Directory
).
- Depending on the size and number of books, this can take a long time.
Note
When I ran this, it took roughly 3 hours for the import to finish. Notes are somewhat short and there are roughly 11,000 tags.