diff options
Diffstat (limited to 'bin')
| -rw-r--r-- | bin/Makefile | 18 | ||||
| -rwxr-xr-x | bin/cleaner.py | 62 | ||||
| -rwxr-xr-x | bin/export.py | 158 |
3 files changed, 238 insertions, 0 deletions
diff --git a/bin/Makefile b/bin/Makefile new file mode 100644 index 00000000..10ac0d1e --- /dev/null +++ b/bin/Makefile | |||
| @@ -0,0 +1,18 @@ | |||
# Rebuild the static site from a fresh CMS export.
#
#   rebuild - wipe previous output and re-run the database export
#   nikola  - initialise the Nikola site inside git/ and copy site assets in
#   build   - render the site
#
# None of the targets produce a file of their own name, so they are declared
# phony; otherwise a stray file or directory called e.g. "build" would make
# the target look up to date and silently skip it.
.PHONY: all rebuild nikola build

all: rebuild nikola build

rebuild:
	rm -rf git dump
	./export.py
	rm git/pages/updates.md

# "cd git && ..." instead of "cd git; ...": if the cd fails, nikola must not
# run in the current directory.
nikola:
	venv/bin/nikola init -q git
	cp site/conf.py git
	cd git && nikola plugin -i localsearch
	cd git && nikola theme -n ccc --parent=bootstrap4
	cp -r site/themes/ccc git/themes/
	cp -r site/data site/shortcodes git/
	cp -r site/index.rst site/index.en.rst site/search.html site/cpu site/themen git/pages/

build:
	cd git && nikola build --backend=sqlite3
diff --git a/bin/cleaner.py b/bin/cleaner.py new file mode 100755 index 00000000..d8c999da --- /dev/null +++ b/bin/cleaner.py | |||
| @@ -0,0 +1,62 @@ | |||
#! venv/bin/python
"""Clean up page translation bodies that contain '<p id=' markup.

For every row of ``page_translations`` whose body contains ``<p id=``:

* the untouched body is appended to ``orig.txt``;
* tags without content or visible text (except ``<br>``) are removed;
* all attributes are stripped from ``<p>`` tags whose id matches ``magic``;
* the cleaned markup is appended to ``modi.txt`` and written back to the
  database.

NOTE(review): this script was recovered from a rendering that lost all
indentation; block nesting was reconstructed from the control flow.
"""
import psycopg2
import psycopg2.extras
import pathlib
import os.path
import datetime
import pypandoc
from bs4 import BeautifulSoup
from datetime import timedelta, datetime, tzinfo
#from langdetect import detect
import sys
import re

conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

cursor.execute("select * from page_translations")
page_translations = cursor.fetchall()


def is_empty_tag(tag):
    """Return True for a tag with no children or no visible text, except <br>.

    NOTE(review): void elements such as <img> or <hr> have no contents and are
    therefore matched as well - confirm that dropping them is intended.
    """
    has_no_text = not tag.contents or len(tag.get_text(strip=True)) <= 0
    return has_no_text and tag.name != 'br'


# Context managers make sure both audit files are flushed and closed even if
# a row fails half-way through.
with open("orig.txt", "w", encoding="utf-8") as fo, \
        open("modi.txt", "w", encoding="utf-8") as fm:
    for translation in page_translations:
        body = translation.get('body')
        if not body or '<p id=' not in body:
            continue

        fo.write(body)

        # translation['page_id'] is the id of the owning page; using it
        # directly avoids one extra query per row and does not crash on
        # translations whose page row no longer exists.
        print('Needing fixup: {}'.format(translation['page_id']))
        soup = BeautifulSoup(body, 'html5lib')

        # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
        for empty in soup.find_all(is_empty_tag):
            empty.decompose()

        for elm in soup.find_all('p', id=re.compile(r"magic")):
            # Reset to an empty mapping rather than deleting the attribute
            # object itself, so the tag can still be rendered afterwards.
            elm.attrs = {}

        # html5lib wraps the fragment in <html><body>; only the children of
        # <body> are the actual content. The body tag is gone if the sweep
        # above removed it for having no visible text.
        cleaned_root = soup.body
        if cleaned_root is None:
            continue

        cleaned = "".join(str(child) for child in cleaned_root.children)
        fm.write(cleaned)
        cursor.execute("update page_translations set body = %s where id = %s", [cleaned, translation['id']])

# NOTE(review): the original position of the commit (per row or once at the
# end) could not be recovered; a single commit keeps the run all-or-nothing.
conn.commit()
cursor.close()
conn.close()
diff --git a/bin/export.py b/bin/export.py new file mode 100755 index 00000000..26460e8f --- /dev/null +++ b/bin/export.py | |||
| @@ -0,0 +1,158 @@ | |||
#! venv/bin/python
"""Export published CMS pages and uploaded assets into a git history.

Steps:

1. Every asset with an upload file name is queued for copying into
   ``git/images/`` or ``git/files/`` and an old-URL -> new-path mapping is
   written to ``redirects.json``.
2. Every published page revision is rendered (metadata header, optional
   abstract, body converted from HTML to Markdown via pandoc) into
   ``dump/<unique_name>[.en].md:<revision>@<timestamp>``.
3. All queued items are sorted by (date, revision), copied into ``git/`` and
   committed one by one with the editor as author and committer.

NOTE(review): this script was recovered from a rendering that lost all
indentation; block nesting was reconstructed from the control flow.
"""
import psycopg2
import psycopg2.extras
import pathlib
import os.path
import datetime
import pypandoc
import json
import shutil
import subprocess
from datetime import timedelta, datetime, tzinfo
#from langdetect import detect
import sys

# Identity used for assets and for editors that cannot be resolved.
FALLBACK_LOGIN = 'admin'
FALLBACK_EMAIL = 'admin@cccms.de'

conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute("select id, login, email, admin from users", [])
user = cursor.fetchall()

# Lookup tables built once instead of one query per page revision.
# setdefault keeps the first row per key, like a first-match search would.
login_by_id = {}
email_by_login = {}
for person in user:
    login_by_id.setdefault(person['id'], person['login'])
    email_by_login.setdefault(person['login'], person['email'])


def login_for(user_id):
    """Return the login for *user_id*, or the fallback login if unknown/unset."""
    if not user_id:
        return FALLBACK_LOGIN
    return login_by_id.get(user_id, FALLBACK_LOGIN)


page_list = []
redirect_list = {}

pathlib.Path('git').mkdir(parents=True, exist_ok=True)
# Argument lists (no shell) so names with spaces or quotes cannot break or
# inject into the command line. Exit codes are ignored, as before.
subprocess.run(["git", "-C", "git/", "init"])
for directory in ('git/images', 'git/files', 'dump/images', 'dump/files'):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

rev = 0
cursor.execute("select id, name, upload_content_type, upload_file_name, upload_updated_at from assets order by id", [])
assets = cursor.fetchall()
for asset in assets:
    if not asset.get('upload_file_name'):
        continue
    source = "/usr/local/www/cccms/public/system/uploads/{}/original/{}".format(asset['id'], asset['upload_file_name'])
    destdir = 'files/'
    # A missing content type is treated as "not an image".
    if (asset['upload_content_type'] or '').startswith('image'):
        destdir = 'images/'
    page_list.append({
        'date': asset['upload_updated_at'],
        'revision': rev,
        'comment': "asset {}".format(asset['name']),
        'fname': source,
        'gname': destdir + asset['upload_file_name'],
        'editor': FALLBACK_LOGIN,
        'email': FALLBACK_EMAIL,
    })
    rev = rev + 1
    redirect_list['/public/system/uploads/{}/original/{}'.format(asset['id'], asset['upload_file_name'])] = destdir + asset['upload_file_name']

with open('redirects.json', 'w', encoding='utf-8') as outfile:
    json.dump(redirect_list, outfile)

cursor.execute("select id, unique_name, created_at from nodes order by id", [])
nodes = cursor.fetchall()

for node in nodes:
    if node['id'] == 1:
        continue

    unique_name = node['unique_name']
    # Check the value, not the column's presence: the column is always
    # selected, but a NULL/empty name cannot be turned into a file path.
    if not unique_name:
        print("WARNING: NO unique_name in node " + str(node['id']))
        continue
    unique_name = str(unique_name)

    # Create the target directories for every node, so that top-level pages
    # (no "/" in the name) also find git/pages/ in place.
    pathlib.Path(os.path.dirname('dump/' + unique_name)).mkdir(parents=True, exist_ok=True)
    if unique_name.startswith('updates/'):
        pathlib.Path(os.path.dirname('git/' + unique_name)).mkdir(parents=True, exist_ok=True)
    else:
        pathlib.Path(os.path.dirname('git/pages/' + unique_name)).mkdir(parents=True, exist_ok=True)

    cursor.execute("select id, revision, created_at, updated_at, published_at, user_id, editor_id from pages where node_id = %s", [node['id']])
    pages = cursor.fetchall()

    for page in pages:
        # ignoring unpublished pages
        if not page['published_at']:
            continue

        cursor.execute("select name from tags where id in (select tag_id from taggings where taggable_id = %s)", [page['id']])
        tags = [row['name'] for row in cursor.fetchall()]

        editor = login_for(page['editor_id'])
        creator = login_for(page['user_id'])

        previewimage = ''
        cursor.execute("select upload_file_name from related_assets full join assets on related_assets.asset_id = assets.id where related_assets.page_id = %s and position = 1;", [page['id']])
        related_asset = cursor.fetchall()
        if related_asset:
            previewimage = related_asset[0].get('upload_file_name')

        cursor.execute("select locale, title, abstract, body, created_at, updated_at from page_translations where page_id = %s", [page['id']])
        page_translations = cursor.fetchall()
        for translation in page_translations:
            if not translation.get("title"):
                continue

            suffix = '.en.md' if translation['locale'] == 'en' else '.md'
            fname = "dump/" + unique_name + suffix + ':' + str(page['revision']) + '@' + str(int(page['updated_at'].timestamp()))
            gitname = unique_name + suffix
            if not gitname.startswith('updates/'):
                gitname = 'pages/' + gitname

            abstract = translation.get('abstract')
            body = translation.get('body')

            with open(fname, "w", encoding="utf-8") as f:
                f.write("title: {}\n".format(' '.join(translation["title"].split())))
                f.write("date: {}\n".format(page['published_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                if page.get('updated_at'):
                    f.write("updated: {}\n".format(page['updated_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                f.write("author: {}\n".format(creator))
                f.write("tags: {}\n".format(', '.join(tags).lower()))
                if previewimage:
                    f.write("previewimage: /images/{}\n".format(previewimage))
                f.write("\n")

                # Add abstract, if one is there
                # NOTE(review): the teaser marker is assumed to be emitted only
                # after an abstract - confirm against the original nesting.
                if abstract:
                    f.write(str(abstract))
                    f.write("\n\n")
                    if body:
                        f.write("<!-- TEASER_END -->\n\n")

                # Skip the conversion for a missing body; str(None) would
                # otherwise end up as the literal text "None" in the page.
                # NOTE(review): newer pandoc releases replace --atx-headers
                # with --markdown-headings=atx - confirm the installed version.
                if body:
                    f.write(pypandoc.convert_text(str(body), 'markdown-smart', format='html-native_divs-native_spans', extra_args=['--atx-headers']))

            page_list.append({
                'date': page['updated_at'],
                'revision': page['revision'],
                'comment': "page revision {}".format(page['revision']),
                'fname': fname,
                'gname': gitname,
                'editor': editor,
                # Editors without a users row (e.g. the fallback login) get
                # the same address that is used for assets.
                'email': email_by_login.get(editor) or FALLBACK_EMAIL,
            })

# Replay everything in chronological order so the git history mirrors the
# edit history of the CMS.
page_list.sort(key=lambda tup: (tup['date'], tup['revision']))
for page in page_list:
    print(page)
    try:
        shutil.copy(page['fname'], os.path.join('git', page['gname']))
    except OSError as err:
        # Best effort, as before: a missing upload must not abort the export.
        print("WARNING: could not copy {}: {}".format(page['fname'], err), file=sys.stderr)
        continue

    subprocess.run(["git", "-C", "git/", "add", "--", page['gname']])
    commit_env = dict(os.environ, GIT_COMMITTER_NAME=page['editor'], GIT_COMMITTER_EMAIL=page['email'])
    subprocess.run(
        [
            "git", "-C", "git/", "commit",
            "-m", "committing {}".format(page['comment']),
            "--author={} <{}>".format(page['editor'], page['email']),
            "--date={}".format(page['date']),
        ],
        env=commit_env,
    )

cursor.close()
conn.close()
