commit 694acf8599712396e4097138edb08c23756044c5 Author: Cassowary Date: Mon Sep 25 12:16:34 2023 -0700 Initial chekin post-discontinuity. diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..e373895 --- /dev/null +++ b/.gitignore @@ -0,0 +1,149 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files 
+.projectile + +# directory configuration +.dir-locals.el diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cc948be --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Cas Rusnov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..3e854e2 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include heckweasel/defaults/*.yaml \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..2ce7586 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# Heckweasel # + +Heckweasel is a site compiler engineered like a metadata-based CMS with a template rendering system. Underneath it uses +Jinja2 templates to provide programmability, and a structured metadata system, along with processors to convert +user-friendly files such as Markdown and RST into HTML with templates. 
diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..ccb1d2e --- /dev/null +++ b/TODO.md @@ -0,0 +1,24 @@ +# TODO # + +* Smart CSS things (fill in the processors) +* Project global defines, parameters. +* pre- and post-scripts that will be run from __main__, either some shipped with heckweasel or project-level. +* Library of template modules? ATOM et al. +* Some off the shelf website templates and a template manager. +* Live refreshing server thing which maps a heckweasel tree into a web server's memory and updates on change. +* https://github.com/Python-Markdown/markdown/wiki/Third-Party-Extensions + * add markdown_link_attr_modifier extension + * add figureAltCaption extension + * add qrcode extension +* Add support to define macros or whatever for Jinja, or to include generic stanzas in any output so adding macros won't mean repeatedly including them. + +* It'd be good to generate a dependency tree and only recompile things based on changes, like makefile-like behavior. + +* Fragments which would be blobs of mechanics like rss feed, thumbnail links, etc. They would be virtual files and other changes to processing + chains and project contents. `python -mheckweasel --fragment=rss,config=foo.meta` etc. + +* Run commands as part of processing chains + +* Project level processing chain overrides in the .meta or whatever. + + diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..a64a156 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,65 @@ +

HECKWEASEL documentation!

+ +

Welcome to the index for HECKWEASEL Documentation. In this directory you’ll find a bunch of files but this is the introduction you need to understand the way heckweasel works and how to use it.

+ +

Introduction Part 2: What the hyeck is Heckweasel?!

+ +

Heckweasel is a website compiler framework. Primarily it allows the creation of a web site using a collection of flat files which are in a maintainable form, producing the less maintainable formats that web browsers use.

+ +

The flat files in a heckweasel project are just a directory of files like any other. There is a default directory structure for projects but that isn’t important right now.

+ +

Heckweasel projects generally take the form of a collection of one or more templates and a collection of one or more files that are filled into the templates. Pervasively, heckweasel draws a distinction between the contents of a web page and the template it gets put into. You can think of the template, as generally used by heckweasel, as a sort of picture frame into which your content is placed. The content itself may be implemented as one of several popular formats such as Markdown and HTML. Also of note is that there are sort of two routes from heckweasel input to heckweasel output: one route is through the template system and the other route merely copies the input to the output.

+ +

Another important detail about heckweasel is metadata. Every item in the heckweasel project (thus, every file in the heckweasel project directory) has a collection of metadata associated with it, such as its file name, creation time, and other objective information, but also any arbitrary information about it such as its title, a short description, thumbnails or whatever. It’s also important to note that the content of a file counts as metadata, and is stored the same way inside of heckweasel’s way of looking at the files. Metadata is stored with the file as filename.meta and directories contain metadata in the file called .meta. Metadata is also inherited! So setting a template in a directory’s metadata will apply to all of the contents of that directory. Metadata is all in a JSON format called JStyleSon, which is JSON except you can have comments in it. All of these metadata are accessible from the templates, which leads to…

+ +

The final important detail about heckweasel is that it, at its core, uses a programmable template system called Jinja. Jinja allows a lot, and I mean a lot of flexibility in the way that the output is produced, giving complete programmability. This allows templates (and pages, for that matter) to contain programmable outcomes such as showing a list of all blog entries (each of which would be a separate file), or making a thumbnail gallery from a collection of pictures, or generating an RSS feed from all of the contents of the site. This also allows the website design to be broken into parts such that commonly-used patterns can be merely included in the file rather than being written repeatedly (although normally this function is done with the page templates).

+ +

Just the very Basic Heckweasel Project

+ +

So with all of that said, the most basic possible heckweasel project that is actually functional would be something like a page template, and a content file called index. Heckweasel operates on an input directory and outputs to an output directory. This is admittedly not a normal use case since it doesn’t benefit much from the elaborate system underneath, but it gets the idea across.

+ +

So you have your project directory mywebsite; inside we can have the directories source and publish, and various files, and well here’s a picture:

+ + + + +

To explain the various files:

+ +

.meta

+ +

This file is a JSON file containing project-wide metadata. Usually this would be metadata that applies, by default, to all files. Some things that affect the way Heckweasel processes files would be template which would set the default template to put content into and templates which would set the directory to look for templates in. By custom we also may want to set the title, author and other things like that which we may want to fill into the output files. We also put things like the eventual published address for the site (site_root).

+ +

Example .meta file:

+ +

```json

+ +

{ + "site_root": "https://website.me", + "author": "Very Nice Person", + "title": "My Website" +}

+ +

```

+ +

default.jinja

diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..501f624 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,151 @@ +# HECKWEASEL documentation! + +Welcome to the index for HECKWEASEL Documentation. In this directory you'll find a bunch of files but this is the introduction you need to understand the way heckweasel works and how to use it. You wouldn't web a site. + +## Introduction: TL;DR + +Heckweasel compiles a set of files into a website. + +There is your website **template**, separate files that are the **content** of your website (such as a blog post, an image, etc), and json files that are the **metadata** for each content file. These get compiled together into static web pages. + +There's a lot more to it, and it is entirely programmable, but basically that's it. + + +## Introduction: What the hyeck is Heckweasel?! + +Heckweasel is a website compiler framework. Primarily it allows the creation of a web site using a collection of flat files which are in a maintainable form, producing the less maintainable formats that web browsers use. + +The flat files in a heckweasel project are just a directory of files like any other. There is a default directory structure for projects but that isn't important right now. + +Heckweasel projects generally take the form of a collection of one or more templates and a collection of one or more files that are filled into the templates. Pervasively, heckweasel draws a distinction between the contents of a web page and the template it gets put into. You can think of the template, as generally used by heckweasel, as a sort of picture frame into which your content is placed. The content itself may be implemented as one of several popular formats such as Markdown and HTML. Also of note is that there are sort of two routes from heckweasel input to heckweasel output: one route is through the template system and the other route merely copies the input to the output. 
+ +Another important detail about heckweasel is metadata. Every item in the heckweasel project (thus, every file in the heckweasel project directory) has a collection of *metadata* associated with it, such as its file name, creation time, and other objective information, but also any arbitrary information about it such as its title, a short description, thumbnails or whatever. It's also important to note that the **content** of a file counts as metadata, and is stored the same way inside of heckweasel's way of looking at the files. Metadata is stored with the file as *filename*.meta and directories contain metadata in the file called .meta. Metadata is also inherited! So setting a template in a directory's metadata will apply to all of the contents of that directory. Metadata is all in a JSON format called JStyleSon, which is JSON except you can have comments in it. All of these metadata are accessible from the templates, which leads to... + +The final important detail about heckweasel is that it, at its core, uses a programmable template system called Jinja. Jinja allows a lot, and I mean a *lot* of flexibility in the way that the output is produced, giving complete programmability. This allows templates (and pages, for that matter) to contain programmable outcomes such as showing a list of all blog entries (each of which would be a separate file), or making a thumbnail gallery from a collection of pictures, or generating an RSS feed from all of the contents of the site. This also allows the website design to be broken into parts such that commonly-used patterns can be merely included in the file rather than being written repeatedly (although normally this function is done with the page templates). 
+ + +## Glossary + +- **template** + - A -link-Jinja2 file which gets filled in with your content +- **content** + - The content which gets filled into templates to produce pages +- **metadata** + - Extra variables or values associated with content, which can be used to modify the way template works and do other tricks + + + +## Just the very Basic Heckweasel Project + +So with all of that said, the most basic possible heckweasel project that is actually functional would be something like a page template, and a content file called index. Heckweasel operates on an input directory and outputs to an output directory. This is admittedly not a normal use case since it doesn't benifit much from the elaborate system underneath, but it gets the idea across. + +So you have your project directory `mywebsite`; inside we can have the directories `source` and `publish`, and various files, and well here's a picture: + +- __mywebsite__ + - __source__ + - *.meta* + - __templates__ + - *default.jinja* + - *index.md* + - *index.md.meta* + - __publish__ + + +To explain the various files: + + +### *.meta* + +This file is a JSON file containing project-wide metadata. Usually this would be metadata that applies, by default, to all files. Some things that affect the way Heckweasel processes files would be `template` which would set the default template to put content into and `templates` which would set the directory to look for templates in. By custom we also may want to set the title, author and other things like that which we may want to fill into the output files. We also put things like the eventual published address for the site (`site_root`). + +Example .meta file: + + +```json +{ + "site_root": "https://website.me", + "author": "Very Nice Person", + "title": "My Website" +} + +``` + +### *default.jinja* + +This is the default template. Heckweasel will look for `templates/default.jinja` unless another templates directory and template are specified. 
Jinja templates might output any kind of text file you want, but usually we put HTML inside them. Here's an example `default.jinja` that makes a barely functional web page but we'll explain more later: + +```jinja2 + + + +{{ metadata.title }} + + +{{content}} + + +``` + +The main thing to notice is that this is a very simple HTML file. It does the bare minimum to render in a browser. The next thing to notice are all of the `{}` things. Those are Jinja commands. A `{{}}` containing a name will fill that name from the variables set in the Jinja environment. In Heckweasel the main two things are `content` and `metadata`. `Metadata` contains the metadata set via the `.meta` and other sources as discussed above. The new thing here is `content`, which is the *contents* of the page! As discussed above, the contents and template are considered separately, and so the page contents are filled into the template where the `{{content}}` tag is! You can also see that the title of the page is set based on the page's `title` metadata. We'll discuss this more in the next section. + +Another interesting thing is, any styling that should be applied to the whole website, to a particular page type, or whatever goes in templates. For example this is where you'd include the site-wide CSS sheet for this site, and it would apply that style to all the pages (we'll discuss this more in a future section). + + +### *index.md* + +This is the contents of the page that will eventually become `index.html` when heckweasel is done with it. Notice it is `.md` which means markdown, a user-friendly markup format - heckweasel will convert this to an HTML fragment and fill in the template's `content` with the result, producing `index.html`. This is how the magic happens! The contents of this file could be something as simple as: + +```markdown +# Welcome! + +Hello this is my website! Hi! +``` + +### *index.md.meta* + +This contains the metadata specific to `index.md`. 
It can be left out if there isn't any specific metadata, but it's useful to make even an empty one for future reference. An example use of this is to set different title for each page. + +Example: + +```json +{ + "title": "Welcome to my Home Page" +} +``` + +### Rolling it all together + +Given the above tree, from the command line in the `mywebsite` directory, to compile this would be as simple as : + +```bash +$ python -mheckweasel source publish +``` + +This would produce, in the `publish` directory, `index.html`, which would have contents like: + +```html + + + +Welcome to my Home Page + + +

Welcome!

+

This is my website! Hi!

+ + +``` + +Notice how the result of converting `index.md` into HTML is inserted into the template where `{{content}}` was, and the value of `title` from `index.md.meta` is inserted where `{{metadata.title}}` was. While `index.md` inherited the top-level `title` metadata from the top `.meta`, its own `index.md.meta` file overrode it. Neat! + +The `publish` directory is ready to be served by a small HTTP server, placed in a web content directory, or whatever. We'll discuss that in a future section about hosting your Heckweasel site. + +## Getting (very slightly) more advanced with Heckweasel + +Now that we see how a project and its parts fit together we can make our little website slightly more interesting. + +### Styling your Web Site + +As we alluded to above, the templates are where style information generally lives. + + diff --git a/docs/metadata.md b/docs/metadata.md new file mode 100755 index 0000000..6cee143 --- /dev/null +++ b/docs/metadata.md @@ -0,0 +1,77 @@ +# METADATA # + +## FORMAT ## + +Metadata is stored as a JSON file which allows C-like comments. + +Metadata is loaded from the top down, so each parent from the root can impart metadata on children. Children can explicitly nullify parent metadata by +assigning it to undefined. + +## STORAGE ## + +On-disk metadata is stored as a file along side the non-metadata file with the extension '.meta', for example the file 'foo.thtml' would have a metadata file as 'foo.thtml.meta'. Metadata for directories (which gets applied to all contents of that directory) is stored in .meta in the directory. + +## DEFAULT KEYS AND VALUES ## + +All files define the following keys by default: + +relpath +: The relative path to the root of the site, useful for prepending to image `src=` and other resource paths such as CSS files and fonts in order to maintain locally viewable output. 
+file_name +: The local path of the file +file_path +: The full path to the file from the root +dir +: The directory to the path from root for this file +os-path +: The native OS path to this file +guessed-type +: The guessed mime-type of the file +stat +: A tree of stat() values in a dictionary, without the ST_ prefix, and with lowercase keys. +templates +: The path to the template files. +uuid +: A UUID for this file based on its path and a specified `uuid-oid-root` metadata +build-time +: The time stamp for the build time + +Files can also explicitly override these which are set to empty defaults: + +mime-type +: Either the specified mime-type or guessed type if undefined. +template +: The full path to the template file +dir-template +: The full path to the filesystem template +title +: A title for this object derived from the template, metadata or other sources. +summary +: A summary of the file contents. +description +: A description of file contents. + +Trees have some metadata that projects should probably override (generally in their top-level .meta): + +uuid-oid-root +: A string added to the beginning of the path that identifies this site, used for deriving OID UUIDs. +author +: The full name of the author of this site (should also be overridden per-file if necessary). +author_email +: The email of the author of this site (see above) +site_root +: The full URL for the root of this web site used for links and whatnot, with ending slash. + +Special Keys that can be defined, these change the processing in predictable ways: + +type +: Define that the file that this metadata is applied to as a specific type from the type mapping table. Useful values are `passthrough` and `templatable` with obvious outcomes. +wildcard_metadata +: Define a dictionary of file globs (patterns which match files such as `*.txt`), with the value being a dictionary of additional metadata to apply to the matched files. 
This is generally +defined at the top level of the project to make certain file patterns treated as special without having to give them their own metadata. + + +## CACHING STRATEGY ## + +The tree is traversed from the top down, each node in the tree is stat(). The mtime walue is compared to the mtime stored in the cache dict for that node. If it is newer, the metadata +is loaded again, and the tree continues to traverse. diff --git a/docs/patterns.md b/docs/patterns.md new file mode 100644 index 0000000..2f95ded --- /dev/null +++ b/docs/patterns.md @@ -0,0 +1,5 @@ +# Patterns for Site Design # + +These are some simple patterns for things commonly needed in websites of various kinds. + +## diff --git a/docs/project-layout.md b/docs/project-layout.md new file mode 100644 index 0000000..8d535cd --- /dev/null +++ b/docs/project-layout.md @@ -0,0 +1,71 @@ +# Project Layout # + +It is recommended that in general your project for Heckweasel site be layed out like: +``` +project_top/ + Makefile - Convenient for building your site + src/ - All "source" pages are contained in here. + .meta - Top-level default metadata is set here + index.cont - The content part of the index page + index.cont.meta - A metadata json file for the index, specifically. + templates/ - Templates go in here + default.jinja2 - Default template that will be used if none are specified + publish/ - The path the build process will create, where the post-processed files go. +``` + + +## Makefile ## + +Makefile is suggested, but not essential, for encapsulating your build commands to produce your +site. Something as simple as: + +``` +build: src/templates/* src/* + python -mheckweasel src publish +``` + +## src/ ## + +This is the top level path that all of your templates, page fragments, images, etc. will be stored. This is basically the "source code" for your site. + +## src/.meta ## + +This is the top level metadata that is used as the default for all subsidiary metadata. 
It is in JSON format (with JS style comments). See for more information. + +Example .meta file: + +``` +{ + "title": "My Website", // this is the default title applied if none are specified + "author": "Super Web Dude", + "site_root": "http://example.com", + "uuid-oid-root": "example.com-", // this is used to generate UUIDs +} +``` + +## src/templates/ ## + +Templates are all stored here, as this is the search path for Jinja. + +## templates/default.jinja2 ## + +If a page specifies a `template` metadata key, the named template is used, however, if not this template is used. Generally speaking this is a complete HTML file, with the `{{ content }}` template string placed where the content of subsidiary pages will be embedded. + +A simple default.jinja2 example: + +``` + + + +{{ metadata.title }} + + +{{content}} + + +``` + + +## publish/ ## + +This is arbitrary, and will be created by heckweasel at build time, but it will be the root path that should be published to your web server. diff --git a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..dc1df1b --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,10 @@ +# I am just writing a simple site with a couple of pages + + +# I am interested in the technicalities of template development + + +# I am interested in the technicalities of deployment + + + diff --git a/docs/templatefunctions.md b/docs/templatefunctions.md new file mode 100644 index 0000000..50cd0d7 --- /dev/null +++ b/docs/templatefunctions.md @@ -0,0 +1,113 @@ +# Template Functions # + +These are functions exposed to the templates which perform various useful actions for the site designer. + +## get_file_list ## + +Return a list of file names based on a wildcard glob, matched against the root of the project. + +Prototype: `get_file_list(file_glob, sort_order, reverse, limit) -> [files]` + +Arguments: +* file_glob: A standard file glob, for example `*.txt` matches all files that end in `.txt` in the root of the project. 
(default: `*`) +* sort_order: A string of either `file_path`, `file_name`, `ctime`, `mtime`, `size` and `ext` (default: `ctime`) +* reverse: whether the sort is reversed (default: False) +* limit: The number of entries to return from the top of the list, 0 for unlimited (default: `0`) + +Returns: +* A list of file names. + +## get_file_name ## + +Return the filename that will result from processing the specified file based on the processors that it will be passed through. + +Prototype: `get_file_name(file) -> outfile` + +Arguments: +* file: The name of a file, with path, from root. + +Returns: +* outfile: The name of the file, with path, that will result from processing. + +## get_file_content ## + +Return the rendered content of specified file. Caution: Can result in infinite loops if two templates include each other. + +Prototype: `get_file_content(file) -> content` + +Arguments: +* file: The name of the input file, with path, from root. + +Returns: +* content: the contents that result from passing the specified file through its processors. + +## get_raw ## + +Return the raw contents of a source file. It is specifically not passed through any processing. + +Prototype: `get_raw(file) -> content` + +Arguments: +* file: The name of the input file, with path, from root. + +Returns: +* content: the raw contents of the input file + +## get_file_metadata ## + +Return the metadata tree associated with a particular file. + +Prototype: `get_file_metadata(file) -> metadata` + +Arguments: +* file: the name of an input file, with path, from root + +Returns: +* metadata: A dictionary of metadata loaded from the file tree. + +## get_time_iso8601 ## + +Return the date/time stamp in ISO 8601 format for a given time_t timestamp for UTC. + +Prototype: `get_time_iso8601(timestamp) -> timestamp` + +Arguments: +* timestamp: A time_t integer or float, in seconds since Jan 1 1970. + +Returns: +* timestamp: A string in ISO8601 format of the date and timestamp, in the UTC timezone. 
+ +## get_date_iso8601 ## + +Return the date stamp in ISO 8601 format for a given time_t timestamp for UTC. + +Prototype: `get_date_iso8601(timestamp) -> timestamp` + +Arguments: +* timestamp: A time_t integer or float, in seconds since Jan 1 1970. + +Returns: +* timestamp: A string in ISO8601 format of the date stamp, in the UTC timezone. + +## pygments_get_css ## + +Return a blob of CSS produced from Pygments for a given `style`. + +Prototype: `pygments_get_css(style) -> css` + +Arguments: +* style (optional): A style identifier for the Pygments' HTMLFormatter. + +Returns: +* css: A string of styles as returned by Pygments' HTMLFormatter. + +## pygments_markup_contents_html ## + +Format a code fragment with Pygments + +Prototype: `pygments_markup_contents_html(input, filetype, style) -> html` + +Arguments: +* input: A string containing the code to format (either literal, or imported with get_raw()). +* filetype: A string describing which lexer to use. +* style (optional) A style identifier for Pygments' HTMLFormatter. 
diff --git a/heckweasel/__init__.py b/heckweasel/__init__.py new file mode 100644 index 0000000..a71c5c7 --- /dev/null +++ b/heckweasel/__init__.py @@ -0,0 +1 @@ +__version__ = '0.7.0' diff --git a/heckweasel/__main__.py b/heckweasel/__main__.py new file mode 100644 index 0000000..a93a418 --- /dev/null +++ b/heckweasel/__main__.py @@ -0,0 +1,157 @@ +# iterate source tree +# create directors in target tree +# for each item: +# run processor(s) on item, each processor could be in a chain or a branch +# Processors also provide filename munging +# output target based on processor output + +import argparse +import logging +import os +import shutil +import sys +import time +from typing import Dict, List, cast + +from .metadata import MetaTree +from .processchain import ProcessorChains +from .processors.processors import PassthroughException +from .pygments import pygments_get_css, pygments_markup_contents_html +from .template_tools import ( + date_iso8601, + file_content, + file_list, + file_list_hier, + file_json, + file_metadata, + file_name, + file_raw, + time_iso8601, +) +from .utils import deep_merge_dicts + +logger = logging.getLogger() + + +def setup_logging(verbose: bool = False) -> None: + pass + + +def parse_var(varspec: str) -> List: + if (not ('=' in varspec)): + return [varspec, True] + return list(varspec.split('=', 2)) + + +def get_args(args: List[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser("Compile a Heckweasel directory into an output directory.") + + parser.add_argument("root", help="The root of the heckweasel directory to process.") + parser.add_argument("output", help="The output directory to export post-compiled files to.") + + parser.add_argument( + "-c", "--clean", help="Remove the target tree before proceeding (by renaming to .bak).", action="store_true" + ) + parser.add_argument("-s", "--safe", help="Abort if the target directory already exists.", action="store_true") + parser.add_argument("-f", "--follow-links", help="Follow 
def main() -> int:
    """Compile the source tree named on the command line into the output directory.

    Walks ``args.root``, mirrors each directory under ``args.output`` and, for
    every file that is not a ``.meta`` or ``~`` file, builds a processor chain
    and writes the chain's output (or copies the file verbatim when the chain
    raises ``PassthroughException``).

    Returns:
        int: 0 on success; 1 when argument validation fails or when ``--safe``
        is set and a target directory already exists.

    """
    try:
        args = get_args(sys.argv[1:])
    except FileNotFoundError as ex:
        print("error finding arguments: {}".format(ex))
        return 1
    setup_logging(args.verbose)
    # --clean does not delete anything: the existing output tree is renamed to
    # "<output>.bak-<unix time>".
    if os.path.exists(args.output) and args.clean:
        bak = "{}.bak-{}".format(args.output, int(time.time()))
        print("cleaning target {} -> {}".format(args.output, bak))
        os.rename(args.output, bak)

    process_chains = ProcessorChains(args.processors)

    # Baseline metadata handed to MetaTree; -D/--define pairs override it below.
    default_metadata = {
        "templates": args.template,
        "template": "default.jinja2",
        "dir-template": "default-dir.jinja2",
        "filters": {},
        "build-time": time.time(),
        "uuid-oid-root": "heckweasel",
        "summary": "",
        "description": "",
        "author": "",
        "author_email": "",
    }
    if args.define:
        # Each entry is indexed as a (name, value) pair.
        for var in args.define:
            default_metadata[var[0]] = var[1]
    meta_tree = MetaTree(args.root, default_metadata)
    # One cache dict per helper factory below.
    file_list_cache = cast(Dict, {})
    file_cont_cache = cast(Dict, {})
    file_name_cache = cast(Dict, {})
    file_raw_cache = cast(Dict, {})
    flist = file_list(args.root, file_list_cache)
    # "globals" is added to the same dict object that was already passed to
    # MetaTree above. NOTE(review): presumably these are the callables
    # templates can use -- confirm against the processors that read
    # ctx["globals"].
    default_metadata["globals"] = {
        "get_file_list": flist,
        "get_hier": file_list_hier(args.root, flist),
        "get_file_name": file_name(args.root, meta_tree, process_chains, file_name_cache),
        "get_file_content": file_content(args.root, meta_tree, process_chains, file_cont_cache),
        "get_json": file_json(args.root),
        "get_raw": file_raw(args.root, file_raw_cache),
        "get_file_metadata": file_metadata(meta_tree),
        "get_time_iso8601": time_iso8601("UTC"),
        "get_date_iso8601": date_iso8601("UTC"),
        "pygments_get_css": pygments_get_css,
        "pygments_markup_contents_html": pygments_markup_contents_html,
        "merge_dicts": deep_merge_dicts,
    }

    for root, _, files in os.walk(args.root, followlinks=args.follow_links):
        # Path of the current directory relative to the source root ("" for the root itself).
        workroot = os.path.relpath(root, args.root)
        if workroot == ".":
            workroot = ""
        target_dir = os.path.join(args.output, workroot)
        print("mkdir -> {}".format(target_dir))
        if not args.dry_run:
            try:
                os.mkdir(target_dir)
            except FileExistsError:
                # An existing directory is only an error in --safe mode;
                # otherwise it is reused.
                if args.safe:
                    print("error, target directory exists, aborting")
                    return 1
        for f in files:
            # FIXME: replace this hard-coded skip list with global, generic filters.
            if f.endswith(".meta") or f.endswith("~"):
                continue
            metadata = meta_tree.get_metadata(os.path.join(workroot, f))
            chain = process_chains.get_chain_for_filename(os.path.join(root, f), ctx=metadata)
            print("process {} -> {} -> {}".format(os.path.join(root, f), repr(chain), os.path.join(target_dir, chain.output_filename)))
            if not args.dry_run:
                try:
                    # The output file is opened before chain.output is evaluated, so
                    # it already exists (empty) if the chain raises below.
                    with open(os.path.join(target_dir, chain.output_filename), "w") as outfile:
                        for line in chain.output:
                            outfile.write(line)
                except PassthroughException:
                    # The chain asked for the file to pass through unchanged: copy the
                    # source over the (empty) output file.
                    shutil.copyfile(os.path.join(root, f), os.path.join(target_dir, chain.output_filename))

    return 0
passthrough + +# Any object that needs jinja scripts but no other explicit processing +templatable: + extension: null + chain: + - jinja2 + +# Any object that needs jinja and to be embedded in a parent template +tembed: + extension: null + chain: + - jinja2 + - jinja2_page_embed + +# Markdown, BBCode and RST are first run through the templater, and then +# they are processed into HTML, and finally embedded in a page template. +markdown: + extension: + - md + chain: + - jinja2 + - process_md + - jinja2_page_embed +bbcode: + extension: + - bb + - pp + chain: + - jinja2 + - process_pp + - jinja2_page_embed +# FIXME implement RST processor +# restructured: +# extension: +# - rst +# chain: +# - jinja2 +# - process_rst +# - jinja2_page_embed + +# # JSON and YAML are split, passed through a pretty printer, and then output +# FIXME implement split chain processor, implement processor arguments +# json: +# extension: +# - json +# chain: +# - split (passthrough) +# - pp_json +# yaml: +# extension: +# - yml +# - yaml +# chain: +# - split (passthrough) +# - pp_yaml + +# Template-html is first passed through the templater, and then embedded +# in a page template +template-html: + extension: + - thtml + - cont + chain: + - jinja2 + - jinja2_page_embed + +# # Smart CSS are simply converted to CSS. 
class MetaCacheMiss(Exception):
    """Raised on cache miss."""


class MetaCache:
    """This class provides an in-memory cache for metadata tree."""

    def __init__(self, max_age: float = 200.0):
        """Initialize the cache.

        Arguments:
            max_age (float): the number of seconds after which a cached item ages out

        """
        self._max_age = max_age
        # key -> (time stamp the value was stored under, value)
        self._cache: Dict[str, Tuple[float, Any]] = {}

    def get(self, key: str, new_time_stamp: float) -> Any:
        """Get an item from the cache.

        Arguments:
            key (str): the cache key to retrieve
            new_time_stamp (float): The time to compare the stored time stamp with

        Returns:
            :obj:misc: The previously stored value.

        Raises:
            MetaCacheMiss: on missing key, or when the entry is older than ``max_age``

        """
        try:
            stored_at, value = self._cache[key]
        except KeyError:
            raise MetaCacheMiss("no item for key {}".format(key)) from None

        # An entry is fresh while its age does not exceed max_age. The previous
        # comparison was inverted: it returned only aged-out entries and
        # reported every fresh one as expired.
        if new_time_stamp - stored_at <= self._max_age:
            return value

        raise MetaCacheMiss("cache expired for key {}".format(key))

    def put(self, key: str, value: Union[Dict, List, int, str, object], time_stamp: float) -> None:
        """Put an item into the cache.

        Arguments:
            key (str): the key to store the cache item under
            value (:obj:misc): the value to store
            time_stamp (float): the time stamp to store the item under

        """
        self._cache[key] = (time_stamp, value)
+ + """ + metablob = dict(self._default_metadata) + # iterate path components from root to target path + comps = [self._root] + rel_path.split("/") + fullpath = "" + ospath = os.path.join(self._root, rel_path) + for pth in comps: + fullpath = os.path.join(fullpath, pth) + st = os.stat(fullpath) + + if os.path.isdir(fullpath): + cachekey = os.path.join(fullpath, ".meta") + else: + cachekey = fullpath + ".meta" + meta = cast(Dict, {}) + try: + st_meta = os.stat(cachekey) + meta = self._cache.get(cachekey, st_meta.st_mtime) + except FileNotFoundError: + st_meta = None # type: ignore + except MetaCacheMiss: + meta = {} + + if not meta and st_meta: + meta = jstyleson.load(open(cachekey, "r")) + self._cache.put(cachekey, meta, st_meta.st_mtime) + + if fullpath == ospath and "wildcard_metadata" in metablob: + for wild in metablob["wildcard_metadata"]: + if fnmatch.fnmatch(pth, wild[0]): + metablob.update(wild[1]) + + metablob.update(meta) + + # return final dict + metablob["dir"], metablob["file_name"] = os.path.split(rel_path) + metablob["file_path"] = rel_path + metablob["relpath"] = os.path.relpath("/", "/" + metablob["dir"]) + metablob["uuid"] = uuid.uuid3(uuid.NAMESPACE_OID, metablob["uuid-oid-root"] + ospath) + metablob["os-path"], _ = os.path.split(fullpath) + metablob["guessed-type"] = guess_mime(ospath) + if "mime-type" not in metablob: + metablob["mime-type"] = metablob["guessed-type"] + metablob["stat"] = {} + for stk in ("st_mtime", "st_ctime", "st_atime", "st_mode", "st_size", "st_ino"): + metablob["stat"][stk.replace("st_", "")] = getattr(st, stk) + + return metablob diff --git a/heckweasel/processchain.py b/heckweasel/processchain.py new file mode 100644 index 0000000..cc1e98c --- /dev/null +++ b/heckweasel/processchain.py @@ -0,0 +1,182 @@ +"""Interface for chains of processors""" + +import os +import os.path +import random +from typing import Any, Dict, Iterable, List, Optional, Type, cast + +import yaml + +from .processors.processors import Processor + + 
class ProcessorChain:
    """Wrap an ordered list of processors together with the input stream they operate on."""

    def __init__(
        self,
        processors: List[Processor],
        file_name: str,
        file_data: Iterable[str],
        file_type: str,
        ctx: Optional[Dict] = None,
    ):
        """Initialize the processing stream.

        Arguments:
            processors (list): A list of processor objects, applied in order.
            file_name (str): The name of the input file.
            file_data (Iterable): An iterable from which to retrieve the input
            file_type (str): the specified file type for consumer information.
            ctx (dict, optional): Context passed to every processor call;
                an empty dict is used when omitted.

        """
        self._file_name = file_name
        self._file_type = file_type
        self._file_data = file_data
        self._processors = processors
        self._ctx: Dict = {} if ctx is None else cast(Dict, ctx)

    def _thread(self, seed: str, hook: str) -> str:
        """Pass ``seed`` through the named method of each processor in turn."""
        value = seed
        for proc in self._processors:
            value = getattr(proc, hook)(value, self._ctx)
        return value

    @property
    def output(self) -> Iterable:
        """Return an iterable for the output of the process chain

        Each processor's ``process`` receives the previous processor's result;
        falsy entries in the processor list are skipped.

        Returns:
            :obj:'iterable': the iterable

        """
        stream = self._file_data
        for proc in self._processors:
            if not proc:
                continue
            stream = proc.process(stream, self._ctx)
        return stream

    @property
    def output_mime(self) -> str:
        """Return the post-processed MIME value from the processing chain

        Returns:
            str: the mime type

        """
        return self._thread(self._file_name, "mime_type")

    @property
    def output_ext(self) -> str:
        """Return the post-processed extension from the processing chain

        Returns:
            str: the extension
        """
        return self._thread(self._file_name, "extension")

    @property
    def output_filename(self) -> str:
        """Return the post-processed filename from the processing chain

        The chain starts from the base name of the input file.

        Returns:
            str: the new filename

        """
        return self._thread(os.path.basename(self._file_name), "filename")

    def __repr__(self) -> str:
        names = [proc.__class__.__name__ for proc in self._processors]
        return "[{}]".format(",".join(names))
class ProcessorChains:
    """Load a configuration for processor chains, and provide ability to process the chains given a particular input
    file.
    """

    def __init__(self, config: Optional[str] = None):
        """Initialize, with a specified configuration file

        Arguments:
            config (str, optional): The path to a yaml formatted configuration file.
                Defaults to ``defaults/chains.yaml`` next to this module.

        """
        if config is None:  # pragma: no coverage
            config = os.path.join(os.path.dirname(__file__), "defaults", "chains.yaml")

        # safe_load: the chain configuration is plain data, and yaml.load()
        # without an explicit Loader raises TypeError on PyYAML >= 6.
        # The context manager closes the handle that used to leak.
        with open(config, "r") as config_file:
            self.chainconfig = yaml.safe_load(config_file)
        # Name of the fallback chain; replaced by whichever chain declares
        # ``extension: default``.
        self.default = "default"
        self.extensionmap: Dict[str, Any] = {}
        self.processors: Dict[str, Type[Processor]] = {}
        for ch, conf in self.chainconfig.items():
            extensions = conf["extension"]
            if extensions == "default":
                self.default = ch
            elif extensions:
                # Accept a single scalar extension as well as a list; a bare
                # string would otherwise be iterated character by character.
                if isinstance(extensions, str):
                    extensions = [extensions]
                for ex in extensions:
                    if ex is None:
                        continue
                    # A later chain claiming the same extension overrides the earlier one.
                    self.extensionmap[ex] = ch
            for pr in conf["chain"]:
                if pr in self.processors:
                    continue
                # Relative import of ``.processors.<pr>``; each processor module
                # exposes its class as the module attribute ``processor``.
                processor_module = __import__("processors", globals(), locals(), [pr], 1)
                self.processors[pr] = processor_module.__dict__[pr].processor

    def get_chain_for_filename(self, filename: str, ctx: Optional[Dict] = None) -> ProcessorChain:
        """Get the ProcessorChain, as configured for a given file by extension.

        The type is taken from the file extension, forced to ``"default"`` when
        ``ctx["pragma"]`` contains ``"no-proc"``, and finally overridden by
        ``ctx["type"]`` when that key is present.

        Arguments:
            filename (str): The name of the file to get a chain for.
            ctx (dict, optional): Metadata for the file.

        Returns:
            ProcessorChain: the constructed processor chain.
        """
        # splitext only looks at the final path component, so an
        # extension-less file (or one inside a dotted directory) yields no
        # extension instead of a bogus one.
        ext = os.path.splitext(filename)[1]
        ftype = ext[1:] if ext else "default"
        if ctx and "pragma" in ctx:
            if "no-proc" in ctx["pragma"]:
                ftype = "default"

        if ctx and "type" in ctx:
            ftype = ctx["type"]
        # The handle is handed to the chain as its input stream and is consumed
        # lazily, so it cannot be closed here.
        return self.get_chain_for_file(open(filename, "r"), ftype, filename, ctx)

    def get_chain_for_file(
        self, file_obj: Iterable, file_ext: str, file_name: Optional[str] = None, ctx: Optional[Dict] = None
    ) -> ProcessorChain:
        """Get the ProcessorChain for a given iterable object based on the specified file type

        ``file_ext`` is resolved first through the extension map, then as a
        chain name, and finally falls back to the default chain.

        Arguments:
            file_obj (:obj:`iterable`): The input file stream
            file_ext (str): The type (extension) of the input stream
            file_name (str, optional): Name of the input; a random hex string is used when omitted.
            ctx (dict, optional): Metadata passed through to the chain.

        Returns:
            ProcessorChain: the constructed processor chain.

        """
        file_type = self.extensionmap.get(file_ext)
        if not file_type:
            file_type = file_ext if file_ext in self.chainconfig else self.default

        if not file_name:
            file_name = hex(random.randint(0, 65536))

        return ProcessorChain(
            [self.processors[x]() for x in self.chainconfig[file_type]["chain"]],
            cast(str, file_name),
            file_obj,
            file_type,
            ctx,
        )
class Jinja2PageEmbed(Processor):
    """Embed input stream as ``content`` variable in page template defined in context key ``template``."""

    def filename(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the filename of the post-processed file.

        The original extension is replaced by the one ``extension()`` reports.

        Arguments:
            oldname (str): the previous name for the file.
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: the new name for the file

        """
        return os.path.splitext(oldname)[0] + "." + self.extension(oldname, ctx)

    def mime_type(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the mimetype of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: ``ctx["mime"]`` when present, otherwise ``"text/html"``

        """
        # ctx is optional in the signature; do not crash on None.
        return (ctx or {}).get("mime", "text/html")

    def process(self, input_file: Iterable, ctx: Optional[Dict] = None) -> Iterable:
        """Return an iterable object of the post-processed file.

        The whole input is read and rendered as the ``content`` variable of the
        template named by ``ctx["template"]``, loaded from ``ctx["templates"]``.

        Arguments:
            input_file (iterable): An input stream
            ctx (dict): A context object generated from the processor configuration;
                must provide ``templates``, ``template``, ``globals`` and ``filters``.

        Returns:
            iterable: The post-processed output stream
        """
        ctx = cast(Dict, ctx)
        template_env = Environment(loader=FileSystemLoader(ctx["templates"]), extensions=["jinja2.ext.do"])
        template_env.globals.update(ctx["globals"])
        template_env.filters.update(ctx["filters"])
        tmpl = template_env.get_template(ctx["template"])
        content = "".join(input_file)
        return tmpl.render(content=content, metadata=ctx)

    def extension(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the extension of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: ``ctx["extension"]`` when present, otherwise ``"html"``

        """
        return (ctx or {}).get("extension", "html")


processor = Jinja2PageEmbed
class MarkdownProcessor(Processor):
    """Render a Markdown input stream to an HTML output stream."""

    def filename(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the filename of the post-processed file.

        Arguments:
            oldname (str): the previous name for the file.
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: ``oldname`` with its extension replaced by ``.html``

        """
        stem, _ = os.path.splitext(oldname)
        return stem + ".html"

    def mime_type(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the mimetype of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: always ``"text/html"``

        """
        return "text/html"

    def extension(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the extension of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: always ``"html"``

        """
        return "html"

    def process(self, input_file: Iterable, ctx: Optional[Dict] = None) -> Iterable:
        """Return an iterable object of the post-processed file.

        The whole input is joined into one string, converted with the
        ``extra``, ``admonition`` and ``wikilinks`` extensions, and wrapped in
        a ``StringIO``.

        Arguments:
            input_file (iterable): An input stream
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            iterable: The post-processed output stream
        """
        source_text = "".join(input_file)
        html = markdown.markdown(source_text, extensions=["extra", "admonition", "wikilinks"])
        return io.StringIO(html)


processor = MarkdownProcessor  # pylint: disable=invalid-name
class Processor(abc.ABC):  # pragma: no cover
    """Abstract interface implemented by every processor used in a chain."""

    def __init__(self, *args, **kwargs):
        """Initialize the class."""

    @abc.abstractmethod
    def filename(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the filename of the post-processed file.

        Arguments:
            oldname (str): the previous name for the file.
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: the new name for the file

        """

    @abc.abstractmethod
    def mime_type(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the mimetype of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: the new mimetype of the file after processing

        """

    @abc.abstractmethod
    def extension(self, oldname: str, ctx: Optional[Dict] = None) -> str:
        """Return the extension of the post-processed file.

        Arguments:
            oldname (str): the input filename
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            str: the new extension of the file after processing

        """

    @abc.abstractmethod
    def process(self, input_file: Iterable, ctx: Optional[Dict] = None) -> Iterable:
        """Return an iterable object of the post-processed file.

        Arguments:
            input_file (iterable): An input stream
            ctx (dict, optional): A context object generated from the processor configuration

        Returns:
            iterable: The post-processed output stream
        """

    def __repr__(self) -> str:
        # The method was previously only available as ``repr``, which the
        # built-in repr() never calls.
        return self.__class__.__name__

    def repr(self) -> str:
        """Return the class name (kept for callers of the original ``repr()`` method)."""
        return self.__class__.__name__
def file_list(root: str, listcache: Dict) -> Callable:
    """Return a callable that lists files under ``root`` matching one or more globs.

    Arguments:
        root (str): Directory the globs are resolved against.
        listcache (dict): Cache mapping each glob string to its list of file records.

    Returns:
        callable: ``get_file_list(path_glob, *, sort_order, reverse, limit)``.

    """

    def scan(pglob: str) -> List[Dict]:
        """Stat every regular file matching ``pglob`` (``.meta`` and ``~`` files are skipped)."""
        records: List[Dict] = []
        for fil in glob.glob(os.path.join(root, pglob)):
            if os.path.isdir(fil):
                continue
            if fil.endswith((".meta", "~")):
                continue
            st = os.stat(fil)
            records.append(
                {
                    "file_path": os.path.relpath(fil, root),
                    "file_name": os.path.split(fil)[-1],
                    "mtime": st.st_mtime,
                    "ctime": st.st_ctime,
                    "size": st.st_size,
                    "ext": os.path.splitext(fil)[1],
                }
            )
        return records

    def get_file_list(
        path_glob: Union[str, List[str], Tuple[str]],
        *,
        sort_order: str = "ctime",
        reverse: bool = False,
        limit: int = 0) -> Iterable:
        """List the files matching ``path_glob``.

        Arguments:
            path_glob: A glob, or a sequence of globs, relative to ``root``.
            sort_order (str): Record key to sort by (``file_path``, ``file_name``,
                ``mtime``, ``ctime``, ``size`` or ``ext``).
            reverse (bool): Sort descending when true.
            limit (int): When greater than zero, yield at most this many records.

        Returns:
            iterable: A sorted list of record dicts, or an ``islice`` over it
            when ``limit`` is positive.

        """
        if isinstance(path_glob, str):
            path_glob = [path_glob]
        stattable = cast(List, [])
        for pglob in path_glob:
            # Cache each glob's own result list. Previously the shared
            # accumulator was stored, so a cached glob also contained the
            # records of every other glob requested in the same call.
            if pglob not in listcache:
                listcache[pglob] = scan(pglob)
            stattable.extend(listcache[pglob])
        ret = sorted(stattable, key=lambda x: x[sort_order], reverse=reverse)
        if limit > 0:
            return itertools.islice(ret, limit)
        return ret

    return get_file_list
def file_content(root: str, metatree: MetaTree, processor_chains: ProcessorChains, contcache: Dict) -> Callable:
    """Return a callable that yields the processed content of a file under ``root``.

    Arguments:
        root (str): Directory that file names are resolved against.
        metatree (MetaTree): Source of the per-file metadata passed to the chain as ``ctx``.
        processor_chains (ProcessorChains): Factory used to build the chain for the file.
        contcache (dict): Cache mapping file name to its processed content.

    Returns:
        callable: ``get_file_content(file_name)``.

    """

    def get_file_content(file_name: str) -> Iterable:
        """Return the processed content of ``file_name`` as a string.

        Any exception raised while evaluating the chain propagates to the caller.
        """
        if file_name in contcache:
            return contcache[file_name]
        metadata = metatree.get_metadata(file_name)
        chain = processor_chains.get_chain_for_filename(os.path.join(root, file_name), ctx=metadata)
        # Evaluate the chain exactly once: ``chain.output`` re-runs every
        # processor on each access, and the second run would read an already
        # exhausted input stream.
        content = chain.output
        if not isinstance(content, str):
            # Processors may return a stream (e.g. StringIO); str() on that
            # would give its repr rather than its text.
            content = "".join(content)
        # Cache the same value that is returned, so hits and misses agree.
        contcache[file_name] = content
        return content

    return get_file_content
def deep_merge_dicts(dict_a: Dict, dict_b: Dict, _path=None, cpy=False) -> Dict:
    """Merge two dictionaries (deep).
    https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries/7205107#7205107

    Nested dictionaries present on both sides are merged recursively; any
    other value from ``dict_b`` replaces the one in ``dict_a``.

    Arguments:
        dict_a (dict): The dictionary to use as the base. It is modified in
            place unless ``cpy`` is true.
        dict_b (dict): The dictionary to update the values with. Never modified.
        _path (list): internal use.
        cpy (bool): Merge into a deep copy of ``dict_a`` instead of ``dict_a`` itself.

    Returns:
        dict: The merged dictionary (``dict_a`` itself, or its copy when ``cpy`` is true).

    """
    if cpy:
        dict_a = copy.deepcopy(dict_a)
    if _path is None:
        _path = []
    for key, incoming in dict_b.items():
        if key in dict_a:
            existing = dict_a[key]
            if isinstance(existing, dict) and isinstance(incoming, dict):
                deep_merge_dicts(existing, incoming, _path + [str(key)])
                continue
            if existing == incoming:
                continue  # same leaf value
        # Always copy values taken from dict_b. New keys used to be assigned by
        # reference, so a later merge into the result could mutate dict_b's
        # nested containers.
        dict_a[key] = copy.deepcopy(incoming)
    return dict_a
# --- importwp.py ---
"""Convert a Wordpress XML dump into a (mostly working) heckweasel tree."""

import argparse
import datetime
import json
import os
import sys
from urllib.parse import urlparse
from xml.etree.ElementTree import ElementTree

FILE_PATTERN = "{postdate}-{postname}.thtml"

# XML namespaces the export items are looked up under.
_WP_NS = "{http://wordpress.org/export/1.2/}"
_CONTENT_NS = "{http://purl.org/rss/1.0/modules/content/}"

# Seconds to wait on the attachment server before giving up on one download.
_FETCH_TIMEOUT = 60


def parse_args(args):
    """Parse the command line.

    Arguments:
        args (list): argument strings, without the program name.

    Returns:
        argparse.Namespace: the parsed options, with ``post_dir``, ``page_dir``
        and ``attachment_dir`` already joined onto ``out_dir``.

    """
    parser = argparse.ArgumentParser("importwp.py")

    parser.add_argument("input", help="The input file.")
    # nargs="?" is what lets the "." default apply; a plain positional is
    # always required, so the default could never be used before.
    parser.add_argument("out_dir", help="Output root directory.", nargs="?", default=".")
    parser.add_argument(
        "--fetch-attachments",
        help="Fetch all attachments referred to in file.",
        action="store_true",
        dest="fetch_attachments",
    )
    parser.add_argument(
        "--attachment-dir",
        help="Subdirectory to place attachments in.",
        default="attachments",
        dest="attachment_dir",
    )
    parser.add_argument("--post-dir", help="Subdirectory to place posts in.", default="posts", dest="post_dir")
    parser.add_argument("--page-dir", help="Subdirectory to place pages in.", default="", dest="page_dir")

    result = parser.parse_args(args)
    result.post_dir = os.path.join(result.out_dir, result.post_dir)
    result.page_dir = os.path.join(result.out_dir, result.page_dir)
    result.attachment_dir = os.path.join(result.out_dir, result.attachment_dir)

    return result


def _item_record(node):
    """Collect the child elements of one ``<item>`` that the converter uses."""
    return {
        "content": node.find(_CONTENT_NS + "encoded"),
        "title": node.find("title"),
        "pubdate": node.find("pubDate"),
        "description": node.find("description"),
        "post_name": node.find(_WP_NS + "post_name"),
        "categories": node.findall("category"),
        "post_parent": node.find(_WP_NS + "post_parent"),
    }


def parse_input(xmlpath):
    """Read the export file and sort its items by post type.

    Items without a post type, items whose status is ``draft`` and items of
    any type other than post, attachment or page are skipped.

    Arguments:
        xmlpath (str): path of the XML export.

    Returns:
        tuple: ``(posts, attachments, pages)``, three dicts keyed by post id.
        Values are dicts of XML elements (``content``, ``title``, ``pubdate``,
        ``description``, ``post_name``, ``post_parent``), a list of
        ``category`` elements under ``categories`` and, for attachments only,
        ``att_url``.

    Raises:
        ValueError: if the document has no ``<channel>`` element.

    """
    tree_root = ElementTree().parse(source=xmlpath)
    channel = tree_root.find("channel")
    if channel is None:
        raise ValueError("no <channel> element found in {}".format(xmlpath))

    posts = {}
    attachments = {}
    pages = {}
    buckets = {"post": posts, "attachment": attachments, "page": pages}

    for node in channel.findall("item"):
        post_type = node.find(_WP_NS + "post_type")
        if post_type is None:
            continue
        status = node.find(_WP_NS + "status")
        if status is not None and status.text == "draft":
            continue
        bucket = buckets.get(post_type.text)
        if bucket is None:
            continue

        record = _item_record(node)
        if post_type.text == "attachment":
            record["att_url"] = node.find(_WP_NS + "attachment_url")
        bucket[node.find(_WP_NS + "post_id").text] = record

    return posts, attachments, pages


def fetch_attachment(attch, outdir):
    """Download one attachment into ``outdir``.

    The file is named after the last path component of the attachment URL.
    A missing URL or a failed download is reported on stderr and skipped, so
    one bad attachment does not abort the import and an HTTP error page is
    never saved as if it were the attachment.

    Arguments:
        attch (dict): attachment record from ``parse_input``.
        outdir (str): existing directory to write into.

    """
    # Imported here so that converting without --fetch-attachments does not
    # require the requests package to be installed.
    import requests

    url_node = attch["att_url"]
    if url_node is None or not url_node.text:
        print("skipping attachment without a URL", file=sys.stderr)
        return

    url = url_node.text
    p = urlparse(url)
    filename = os.path.join(outdir, os.path.split(p.path)[-1])
    print("fetching attachment", url, "->", filename)
    try:
        r = requests.get(url, timeout=_FETCH_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as err:
        print("failed to fetch attachment", url, ":", err, file=sys.stderr)
        return
    with open(filename, "wb") as outf:
        outf.write(r.content)


def save_cont(post, outdir):
    """Write one post or page and its ``.meta`` JSON sidecar into ``outdir``.

    The file name is built from ``FILE_PATTERN`` using the publication date
    and the post name.

    Arguments:
        post (dict): post or page record from ``parse_input``.
        outdir (str): existing directory to write into.

    """
    dt = datetime.datetime.strptime(post["pubdate"].text, "%a, %d %b %Y %H:%M:%S %z")
    postdate = dt.strftime("%Y-%m-%d-%H%M%S")
    filename = FILE_PATTERN.format(postdate=postdate, postname=post["post_name"].text)
    print(post["title"].text, "->", filename)
    target = os.path.join(outdir, filename)

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; an element with no text yields None, written as an empty file.
    with open(target, "w", encoding="utf-8") as outf:
        outf.write(post["content"].text or "")
    # NOTE: attachments referenced by the post are not handled here.

    tags = []
    category = ""
    for tg in post["categories"]:
        if tg.attrib.get("domain") == "category":
            category = tg.text
        else:
            tags.append(tg.text)

    metadata = {
        "title": post["title"].text,
        "description": post["description"].text,
        "post_time": dt.timestamp(),
        "featured": "",
        "tags": tags,
        "category": category,
    }
    with open(target + ".meta", "w", encoding="utf-8") as outf:
        json.dump(metadata, outf)


def main():
    """Run the converter using ``sys.argv``.

    Returns:
        int: process exit status (always 0 when no exception is raised).

    """
    args = parse_args(sys.argv[1:])

    output_dirs = [args.out_dir, args.page_dir, args.post_dir]
    if args.fetch_attachments:
        output_dirs.append(args.attachment_dir)
    for directory in output_dirs:
        # makedirs also copes with nested subdirectories such as "blog/posts".
        os.makedirs(directory, exist_ok=True)

    posts, attachments, pages = parse_input(args.input)

    if args.fetch_attachments:
        for attachment in attachments.values():
            fetch_attachment(attachment, args.attachment_dir)

    for post in posts.values():
        save_cont(post, args.post_dir)
    for page in pages.values():
        save_cont(page, args.page_dir)

    return 0


if __name__ == "__main__":
    sys.exit(main())
"bandit>=1.1.0", + "flake8>=3.2.1", + "mypy>=0.470", + "prospector[with_everything]>=0.12.4", + "pytest-cov>=1.8.0", + "pytest-xdist>=1.15.0", + "pytest>=3.0.3", + "sphinx_rtd_theme>=0.1.6", + "sphinx-argparse>=0.1.15", + "Sphinx>=1.4.9", + ] +} + +SETUP_REQUIRES = ["pytest-runner>=2.7.1", "setuptools_scm>=1.15.0"] +setup( + author="Cassowary Rusnov", + author_email="alderconestudio@gmail.com", + classifiers=[ + "Development Status :: 1 - Pre-alpha", + "Environment :: Console", + "License :: OSI Approved :: MIT", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + description="A filesystem-based website generator / CMS", + # entry_points={ + # 'console_scripts': [ + # 'cookbook = spicerack.cookbook:main', + # ], + # }, + include_package_data=True, + extras_require=EXTRAS_REQUIRE, + install_requires=INSTALL_REQUIRES, + keywords=["cms", "website", "compiler"], + license="MIT", + long_description=LONG_DESCRIPTION, + name="heckweasel", + packages=find_packages(exclude=["*.tests", "*.tests.*"]), + platforms=["GNU/Linux"], + setup_requires=SETUP_REQUIRES, + use_scm_version=True, + url="https://git.aldercone.studio/aldercone/heckweasel", + zip_safe=False, + version=__version__, +) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..cf924c5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,24 @@ +[tox] +envlist=py{36,37,38,39}-{code-quality, unit} #, py37-sphinx +skipsdist = true + +[testenv] +setenv = + LANG = en_US.UTF-8 +deps = .[tests] +commands = + unit: py.test --strict --cov-report=term-missing --cov=heckweasel heckweasel/tests/unit {posargs} + code-quality: flake8 heckweasel + code-quality: black -l 120 --check heckweasel + code-quality: - prospector -A + code-quality: - mypy --ignore-missing-imports heckweasel + # sphinx: python setup.py build_sphinx -b html + # 
sphinx: python setup.py build_sphinx -b man +basepython = + py36: python3.6 + py37: python3.7 + py38: python3.8 + py39: python3.9 + +[flake8] +max-line-length = 120