#! venv/bin/python
"""Export a cccms database into markdown files and replay them as git history.

The script
  1. reads users, assets, nodes, pages and page translations from the
     ``cccms_dev`` PostgreSQL database,
  2. writes every published page translation as a markdown file (converted
     from HTML with pandoc) below ``dump/``,
  3. writes ``redirects.json`` mapping old upload URLs to new asset paths,
  4. copies every asset and page revision into the ``git/`` work tree in
     (date, revision) order and commits each one with the editor as author.
"""
import datetime
import json
import os.path
import pathlib
import shutil
import subprocess
import sys
from datetime import timedelta, datetime, tzinfo

import psycopg2
import psycopg2.extras
import pypandoc

# Identity used when a revision has no (known) editor or the editor has no
# e-mail address; the asset commits use the same identity.
DEFAULT_LOGIN = 'admin'
DEFAULT_EMAIL = 'admin@cccms.de'


def _ensure_parent(path):
    """Create the parent directory of *path* (and its ancestors) if missing."""
    pathlib.Path(os.path.dirname(path)).mkdir(parents=True, exist_ok=True)


conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
lcursor = conn.cursor()

cursor.execute("select id, login, email, admin from users", [])
user = cursor.fetchall()

# Lookup tables built once instead of querying / scanning per page revision.
login_by_id = {}
email_by_login = {}
for person in user:
    login_by_id[person['id']] = person['login']
    # First row wins, matching a front-to-back scan of the user list.
    email_by_login.setdefault(person['login'], person['email'])

# Every entry describes one future git commit:
#   date, revision, comment, fname (source file), gname (path inside git/),
#   editor, email
page_list = []
# old public upload URL -> new path of the asset
redirect_list = {}

for directory in ('git', 'git/images', 'git/files', 'dump/images', 'dump/files'):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
subprocess.run(["git", "-C", "git/", "init"])

# ---------------------------------------------------------------- assets ---
rev = 0
cursor.execute("select id, name, upload_content_type, upload_file_name, upload_updated_at from assets order by id", [])
assets = cursor.fetchall()
for asset in assets:
    if not asset.get('upload_file_name'):
        continue
    source = "/usr/local/www/cccms/public/system/uploads/{}/original/{}".format(asset['id'], asset['upload_file_name'])
    # A NULL content type is treated like any other non-image upload.
    if (asset['upload_content_type'] or '').startswith('image'):
        destdir = 'images/'
    else:
        destdir = 'files/'
    page_list.append({
        'date': asset['upload_updated_at'],
        'revision': rev,
        'comment': "asset {}".format(asset['name']),
        'fname': source,
        'gname': destdir + asset['upload_file_name'],
        'editor': DEFAULT_LOGIN,
        'email': DEFAULT_EMAIL})
    rev = rev + 1
    redirect_list['/public/system/uploads/{}/original/{}'.format(asset['id'], asset['upload_file_name'])] = destdir + asset['upload_file_name']

with open('redirects.json', 'w') as outfile:
    json.dump(redirect_list, outfile)

# ----------------------------------------------------------------- pages ---
cursor.execute("select id, unique_name, created_at from nodes order by id", [])
nodes = cursor.fetchall()
for node in nodes:
    # Test the value itself: a membership test on the row cannot detect a
    # NULL column, and the paths below cannot be built without a name.
    if not node['unique_name']:
        print("WARNING: NO unique_name in node " + str(node['id']))
        continue
    if node['id'] == 1:
        continue

    name = str(node['unique_name'])
    # Everything except 'updates/...' lives below 'pages/' in the repository.
    git_base = name if name.startswith('updates/') else 'pages/' + name
    _ensure_parent('dump/' + name)
    _ensure_parent('git/' + git_base)

    cursor.execute("select id, revision, created_at, updated_at, published_at, user_id, editor_id from pages where node_id = %s", [node['id']])
    pages = cursor.fetchall()
    for page in pages:
        # ignoring unpublished pages
        if not page['published_at']:
            continue

        cursor.execute("select name from tags where id in (select tag_id from taggings where taggable_id = %s)", [page['id']])
        tags = [row['name'] for row in cursor.fetchall()]

        editor = DEFAULT_LOGIN
        if page['editor_id'] and page['editor_id'] in login_by_id:
            editor = login_by_id[page['editor_id']]
        creator = DEFAULT_LOGIN
        if page['user_id'] and page['user_id'] in login_by_id:
            creator = login_by_id[page['user_id']]

        previewimage = ''
        cursor.execute("select upload_file_name from related_assets full join assets on related_assets.asset_id = assets.id where related_assets.page_id = %s and position = 1;", [page['id']])
        related_asset = cursor.fetchall()
        if related_asset:
            previewimage = related_asset[0].get('upload_file_name')

        cursor.execute("select locale, title, abstract, body, created_at, updated_at from page_translations where page_id = %s", [page['id']])
        page_translations = cursor.fetchall()
        for translation in page_translations:
            if not translation.get("title"):
                continue

            # English gets its own suffix; every other locale shares '.md'.
            suffix = '.en.md' if translation['locale'] == 'en' else '.md'
            # The dump file name carries revision and timestamp so that all
            # revisions of one page can coexist below dump/.
            fname = "dump/" + name + suffix + ':' + str(page['revision']) + '@' + str(int(page['updated_at'].timestamp()))
            gitname = git_base + suffix

            # Explicit encoding: titles and bodies may contain non-ASCII text
            # and the locale default is not guaranteed to be UTF-8.
            with open(fname, "w", encoding="utf-8") as f:
                f.write("title: {}\n".format(' '.join(translation.get("title", "").split())))
                f.write("date: {}\n".format(page['published_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                if page.get('updated_at'):
                    f.write("updated: {}\n".format(page['updated_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                f.write("author: {}\n".format(creator))
                f.write("tags: {}\n".format(', '.join(tags).lower()))
                if previewimage:
                    f.write("previewimage: /images/{}\n".format(previewimage))
                f.write("\n")
                # Add abstract, if one is there
                if translation.get('abstract'):
                    f.write(str(translation.get('abstract', "")))
                    f.write("\n\n")
                if translation.get('body'):
                    f.write("\n\n")
                    f.write(pypandoc.convert_text(
                        str(translation.get('body', "")),
                        'markdown-smart',
                        format='html-native_divs-native_spans',
                        extra_args=['--atx-headers']))

            page_list.append({
                'date': page['updated_at'],
                'revision': page['revision'],
                'comment': "page revision {}".format(page['revision']),
                'fname': fname,
                'gname': gitname,
                'editor': editor,
                # Unknown editor or missing address: fall back instead of
                # aborting the whole export.
                'email': email_by_login.get(editor) or DEFAULT_EMAIL})

# --------------------------------------------------------------- commits ---
# NOTE(review): the sort assumes every entry has a non-NULL date and that all
# dates are comparable with each other - confirm for assets.upload_updated_at.
page_list.sort(key=lambda entry: (entry['date'], entry['revision']))

for page in page_list:
    print(page)
    target = os.path.join('git', page['gname'])
    try:
        _ensure_parent(target)
        shutil.copyfile(page['fname'], target)
    except OSError as err:
        # Best effort: a missing or unreadable source must not stop the run.
        print("WARNING: could not copy {}: {}".format(page['fname'], err))
        continue

    # Argument lists (no shell) keep file names, logins and comments with
    # spaces, quotes or shell metacharacters intact.
    subprocess.run(["git", "-C", "git/", "add", "--", page['gname']])
    commit_env = dict(
        os.environ,
        GIT_COMMITTER_NAME=str(page['editor']),
        GIT_COMMITTER_EMAIL=str(page['email']))
    # No check=True: git exits non-zero when a revision changed nothing,
    # which is expected here and must not abort the replay.
    subprocess.run(
        ["git", "-C", "git/", "commit",
         "-m", "committing {}".format(page['comment']),
         "--author={} <{}>".format(page['editor'], page['email']),
         "--date={}".format(page['date'])],
        env=commit_env)