From b760b1d22d950e70aac5911c871f959d80affcfa Mon Sep 17 00:00:00 2001
From: navanchauhan <navanchauhan@gmail.com>
Date: Mon, 7 Nov 2022 23:36:11 -0700
Subject: add blog post

---
 Content/posts/2022-11-07-a-new-method-to-blog.md | 34 ++++++++++++++++++++++++
 poetry.lock                                      | 17 +++++++-----
 2 files changed, 45 insertions(+), 6 deletions(-)
 create mode 100644 Content/posts/2022-11-07-a-new-method-to-blog.md

diff --git a/Content/posts/2022-11-07-a-new-method-to-blog.md b/Content/posts/2022-11-07-a-new-method-to-blog.md
new file mode 100644
index 0000000..d419a0f
--- /dev/null
+++ b/Content/posts/2022-11-07-a-new-method-to-blog.md
@@ -0,0 +1,34 @@
+---
+date: 2022-11-07 23:29
+description: Writing posts in markdown using pen and paper
+tags: Python, OCR, Microsoft Azure
+---
+
+# A new method to blog
+
+[Paper Website](https://paperwebsite.com) is a service that lets you build a website with just pen and paper. I am going to try and replicate the process.
+
+## The Plan
+The continuity feature on macOS + iOS lets you scan PDFs directly from your iPhone. I want to be able to scan these pages and automatically run an Automator script that takes the PDF and OCRs the text. Then I can further clean the text and convert from markdown.
+
+## Challenges
+
+I quickly realised that the OCR software I planned on using could not detect my shitty handwriting accurately. I tried using ABBYY FineReader, Prizmo and OCRmyPDF. (ABBYY FineReader and Prizmo can both be automated with Automator.)
+
+Now, I could either write neater, or use an external API like Microsoft Azure.
+
+## Solution
+
+### OCR
+
+In the PDFs, all the scans are saved as images on a page. I extract the image and then send it to Azure's API. 
+
+### Paragraph Breaks
+The recognised text had multiple lines breaking in the middle of the sentence. Therefore, I use what is called a [pilcrow](https://en.wikipedia.org/wiki/Pilcrow) to specify paragraph breaks. But, rather than trying to draw the normal pilcrow, I just use the HTML entity `&#182;` which is the pilcrow character.
+
+## Where is the code?
+I created a [GitHub Gist](https://gist.github.com/navanchauhan/5fc602b1e023b60a66bc63bd4eecd4f8) for a sample Python script to take the PDF and print the text.
+
+A more complete version with Automator scripts and an entire publishing pipeline will be available as a GitHub and Gitea repo soon.
+
+**In Part 2, I will discuss some more features**
diff --git a/poetry.lock b/poetry.lock
index ac8c4bd..64be20b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -14,12 +14,17 @@ i18n = ["Babel (>=2.7)"]
 
 [[package]]
 name = "markdown2"
-version = "2.4.3"
+version = "2.4.6"
 description = "A fast and complete Python implementation of Markdown"
 category = "main"
 optional = false
 python-versions = ">=3.5, <4"
 
+[package.extras]
+all = ["pygments (>=2.7.3)", "wavedrom"]
+code_syntax_highlighting = ["pygments (>=2.7.3)"]
+wavedrom = ["wavedrom"]
+
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -30,12 +35,15 @@ python-versions = ">=3.7"
 
 [[package]]
 name = "pygments"
-version = "2.12.0"
+version = "2.13.0"
 description = "Pygments is a syntax highlighting package written in Python."
 category = "main"
 optional = false
 python-versions = ">=3.6"
 
+[package.extras]
+plugins = ["importlib-metadata"]
+
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
@@ -89,7 +97,4 @@ markupsafe = [
     {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"},
     {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]
-pygments = [
-    {file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"},
-    {file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"},
-]
+pygments = []
-- 
cgit v1.2.3