# -*- coding: utf-8 -*-
#
#  script.py - a Sakura Script parser
#  Copyright (C) 2001, 2002 by Tamito KAJIYAMA
#  Copyright (C) 2004-2009 by Shyouzou Sugitani <shy@users.sourceforge.jp>
#
#  This program is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License (version 2) as
#  published by the Free Software Foundation.  It is distributed in the
#  hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
#  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#  PURPOSE.  See the GNU General Public License for more details.
#

import re
import sys

# Token types produced by Parser.tokenize().
TOKEN_TAG         = 1  # backslash tag, e.g. \e, \s0, \_w, \__c, \URL
TOKEN_META        = 2  # percent meta string, e.g. %username, %month
TOKEN_OPENED_SBRA = 3  # a "[" character
TOKEN_CLOSED_SBRA = 4  # a "]" character
TOKEN_NUMBER      = 5  # a run of decimal digits
TOKEN_STRING      = 6  # other text, or a lone "%" / "\" (see below)

# (token type, compiled regex) pairs.  Parser.tokenize() tries the
# entries in this order at each position and uses the first one that
# matches, so the order of the list is significant.
patterns = [
    # NOTE(review): "+---" in the first character class is the range
    # "+".."-" followed by a literal "-", so "\," is also tokenized as a
    # tag -- confirm that this is intended.
    (TOKEN_TAG, re.compile(r'\\[ehunjcxtpqzy*v0123456789fmia!&+---]|'
                           r'\\[sb][0-9]?|\\w[0-9]|\\_[wqslvVbe+cumna]|'
                           r'\\__[ct]|\\URL')),
    (TOKEN_META, re.compile(r'%month|%day|%hour|%minute|%second|%username|'
                            r'%selfname2?|%keroname|%friendname|%songname|'
                            r'%screen(width|height)|%exh|%et|%m[szlchtep?]|'
                            r'%dms|%j|%c')),
    (TOKEN_NUMBER, re.compile(r'[0-9]+')),
    (TOKEN_OPENED_SBRA, re.compile(r'\[')),
    (TOKEN_CLOSED_SBRA, re.compile(r'\]')),
    # A run of ordinary characters and of the escapes "\\", "\%", "\]".
    (TOKEN_STRING, re.compile(r'(\\\\|\\%|\\\]|[^\\\[\]%0-9])+')),
    # Fallback: a single "%" or "\" that no pattern above accepted.  The
    # parser reports these as an unknown meta string / unknown tag.
    (TOKEN_STRING, re.compile(r'[%\\]')),
    ]

# Kinds of node in the list returned by Parser.parse() (first item of
# each node tuple).
SCRIPT_TAG  = 1
SCRIPT_TEXT = 2

# Kinds of chunk inside a text tuple (first item of each chunk tuple).
TEXT_META   = 1
TEXT_STRING = 2


class ParserError(Exception):
    """Error raised while parsing a script, with enough state to resume.

    The instance can be unpacked like a 2-tuple::

        done, rest = error

    where ``done`` is the list of script nodes parsed so far and ``rest``
    is the part of the source that remains after the error.  With the
    'strict' error scheme both are always empty ([] and ''), so a caller
    that loops on ``rest`` stops after the first error; with 'loose'
    they allow the caller to continue after the offending lexeme.

    Attributes:
        message: human readable description of the error.
        error:   error scheme, 'strict' or 'loose'.
        script:  nodes parsed before the error ([] if none were given).
        src:     the source string being parsed ('' if none was given).
        column:  index in src where the error was detected, or None.
        length:  length of the offending lexeme (0 if unknown).
        skip:    number of characters after column to drop from ``rest``.
    """

    def __init__(self, message, error='strict',
                 script=None, src=None, column=None, length=None, skip=None):
        """Store the error description and the parser state.

        Raises:
            ValueError: if error is neither 'strict' nor 'loose'.
        """
        if error not in ['strict', 'loose']:
            # call form of raise: valid on both Python 2 and Python 3
            raise ValueError('unknown error scheme: %s' % str(error))
        # populate self.args so that repr() and pickling keep the message
        Exception.__init__(self, message)
        self.message = message
        self.error = error
        self.script = script or []
        self.src = src or ''
        self.column = column
        self.length = length or 0
        self.skip = skip or 0

    def __getitem__(self, n):
        """Return item n of the (done, rest) pair described on the class.

        Raises:
            IndexError: if n is neither 0 nor 1; this is also what ends
                tuple unpacking of the instance.
        """
        if n == 0:
            if self.error == 'strict':
                return []
            else:
                return self.script
        elif n == 1:
            if self.error == 'strict' or self.column is None:
                return ''
            else:
                return self.src[self.column + self.skip:]
        else:
            raise IndexError('tuple index out of range')

    def __str__(self):
        """Return the message followed by a dump of the source.

        When the column is known, the offending lexeme (or a single
        space if it is empty) is wrapped in the escape sequences
        '\\x1b[7m' and '\\x1b[m'.
        """
        if self.column is not None:
            column = self.column
            if self.src:
                dump = ''.join((self.src[:column],
                                '\x1b[7m',
                                (self.src[column:column + self.length] or ' '),
                                '\x1b[m',
                                self.src[column + self.length:]))
            else:
                dump = ''
        else:
            column = '??'
            dump = self.src
        return 'ParserError: column %s: %s\n%s' % (column, self.message, dump)

class Parser:
    """Tokenizer and parser for Sakura Script.

    parse() returns a list of nodes.  Each node is one of

        (SCRIPT_TAG, name, argument, ...)
        (SCRIPT_TEXT, chunks)

    where chunks is a tuple of (TEXT_STRING, string) and
    (TEXT_META, name[, argument]) items.  Tag arguments are either
    strings or such chunk tuples, depending on the tag.

    The state of a parse (src, tokens, column, length, script) is kept
    on the instance and is overwritten by every call to parse().
    """

    def __init__(self, error='strict'):
        """Create a parser.

        Args:
            error: error scheme handed to every ParserError raised by
                this parser, 'strict' or 'loose'.

        Raises:
            ValueError: if error is neither 'strict' nor 'loose'.
        """
        if error not in ['strict', 'loose']:
            # call form of raise: valid on both Python 2 and Python 3
            raise ValueError('unknown error scheme: %s' % str(error))
        self.error = error

    def perror(self, msg, position='column', skip=None):
        """Build (but do not raise) a ParserError for the current state.

        Args:
            msg: error message.
            position: 'column' to report the current token, or 'eol' to
                report the end of the source.
            skip: how much of the source a 'loose' caller should drop
                after the error column: 'length' (the current token),
                'rest' (everything that is left) or None (nothing).
                Ignored when position is 'eol'.

        Returns:
            The ParserError instance; the caller raises it.

        Raises:
            ValueError: if position or skip has an unknown value.
        """
        if position not in ['column', 'eol']:
            raise ValueError('unknown position scheme: %s' % str(position))
        if skip not in ['length', 'rest', None]:
            raise ValueError('unknown skip scheme: %s' % str(skip))
        if position == 'column':
            column = self.column
            length = self.length
            if skip == 'length':
                skip = length
            elif skip == 'rest':
                skip = len(self.src[column:])
            else:
                skip = 0
        else:
            column = len(self.src)
            length = 0
            skip = 0
        return ParserError(msg, self.error,
                           self.script, self.src, column, length, skip)

    def tokenize(self, s):
        """Split s into a list of (token type, lexeme) pairs.

        At each position the entries of the module level ``patterns``
        list are tried in order and the first match is used.

        Raises:
            RuntimeError: if no pattern matches at some position.
        """
        tokens = []
        pos = 0
        end = len(s)
        while pos < end:
            for token, pattern in patterns:
                match = pattern.match(s, pos)
                if match:
                    break
            else:
                raise RuntimeError('should not reach here')
            tokens.append((token, s[pos:match.end()]))
            pos = match.end()
        return tokens

    def next_token(self):
        """Remove and return the next (token type, lexeme) pair.

        self.column is advanced past the previously returned token, so
        that column/length describe the token being returned.

        Raises:
            ParserError: if there is no token left.
        """
        try:
            token, lexeme = self.tokens.pop(0)
        except IndexError:
            raise self.perror('unexpected end of script', position='eol')
        self.column += self.length
        self.length = len(lexeme)
        return token, lexeme

    def parse(self, s):
        """Parse the script s and return the list of nodes.

        An empty (or otherwise false) s yields [] without touching the
        parser state.

        Raises:
            ParserError: on an unknown tag or meta string, on a
                malformed or missing argument, and when a ``\\_a[...]``
                anchor is still open at the end of the script.  In the
                last case the script attached to the error already
                contains the closing ``\\_a`` tag.
        """
        if not s:
            return []
        # tokenize the script
        self.src = s
        self.tokens = self.tokenize(self.src)
        self.column = 0
        self.length = 0
        # parse the sequence of tokens
        self.script = []
        text = []            # chunks of the text node being built
        string_chunks = []   # pieces of the TEXT_STRING chunk being built
        scope = 0            # 0 after \0 or \h, 1 after \1 or \u
        anchor = None        # pending error for an unclosed \_a[...] tag
        while self.tokens:
            token, lexeme = self.next_token()
            if token == TOKEN_STRING and lexeme == '\\':
                # keep the text gathered so far in the reported script
                if string_chunks:
                    text.append((TEXT_STRING, ''.join(string_chunks)))
                if text:
                    self.script.append((SCRIPT_TEXT, tuple(text)))
                raise self.perror('unknown tag', skip='length')
            elif token == TOKEN_STRING and lexeme == '%':
                # the "%" itself is kept as text in the reported script
                string_chunks.append(lexeme)
                text.append((TEXT_STRING, ''.join(string_chunks)))
                self.script.append((SCRIPT_TEXT, tuple(text)))
                raise self.perror('unknown meta string', skip='length')
            if token in [TOKEN_NUMBER, TOKEN_OPENED_SBRA,
                         TOKEN_STRING, TOKEN_CLOSED_SBRA]:
                # plain text: undo the "\\" and "\%" escapes
                lexeme = lexeme.replace(r'\\', '\\')
                lexeme = lexeme.replace(r'\%', '%')
                string_chunks.append(lexeme)
                continue
            if string_chunks:
                text.append((TEXT_STRING, ''.join(string_chunks)))
                string_chunks = []
            if token == TOKEN_META:
                if lexeme == '%j':
                    # NOTE(review): if read_sbra_id() raises here, the
                    # chunks in ``text`` have not been added to
                    # self.script and are missing from the error.
                    argument = self.read_sbra_id()
                    text.append((TEXT_META, lexeme, argument))
                else:
                    text.append((TEXT_META, lexeme))
                continue
            # from here on the token is a tag, which ends the text node
            if text:
                self.script.append((SCRIPT_TEXT, tuple(text)))
                text = []
            if lexeme in ['\\a', '\\c', '\\e', '\\t', '\\_e',
                          '\\v', '\\x', '\\y', '\\z', '\\_q',
                          '\\4', '\\5', '\\6', '\\7',
                          '\\2', '\\*', '\\-', '\\+', '\\_+',
                          '\\_n', '\\_V', '\\__c', '\\__t']:
                self.script.append((SCRIPT_TAG, lexeme))
            elif lexeme in ['\\0', '\\h']:
                self.script.append((SCRIPT_TAG, lexeme))
                scope = 0
            elif lexeme in ['\\1', '\\u']:
                self.script.append((SCRIPT_TAG, lexeme))
                scope = 1
            elif lexeme in ['\\s', '\\b', '\\p']:
                argument = self.read_sbra_id()
                self.script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme.startswith('\\s') or \
                 lexeme.startswith('\\b') or \
                 lexeme.startswith('\\p') or \
                 lexeme.startswith('\\w'):
                # short form with a single digit glued to the tag
                num = lexeme[2]
                if lexeme.startswith('\\s') and scope == 1:
                    # in scope 1 the digit of \sN is offset by 10
                    num = str(int(num) + 10)
                self.script.append((SCRIPT_TAG, lexeme[:2], num))
            elif lexeme in ['\\_w']:
                argument = self.read_sbra_number()
                self.script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme in ['\\i', '\\j', '\\&', '\\_u', '\\_m']:
                argument = self.read_sbra_id()
                self.script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme in ['\\_b', '\\_c', '\\_l', '\\_v', '\\m',
                            '\\3', '\\8', '\\9']:
                argument = self.read_sbra_text()
                self.script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme in ['\\n']:
                if self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    argument = self.read_sbra_text()
                    self.script.append((SCRIPT_TAG, lexeme, argument))
                else:
                    self.script.append((SCRIPT_TAG, lexeme))
            elif lexeme in ['\\URL']:
                # one bracket, then any number of bracket pairs
                buf = [self.read_sbra_text()]
                while self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    buf.append(self.read_sbra_text())
                    buf.append(self.read_sbra_text())
                self.script.append((SCRIPT_TAG, lexeme) + tuple(buf))
            elif lexeme in ['\\!']:
                args = self.split_params(self.read_sbra_text())
                self.script.append((SCRIPT_TAG, lexeme) + tuple(args))
            elif lexeme in ['\\q']:
                if self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    # \q[text,ID]
                    args = self.split_params(self.read_sbra_text())
                    if len(args) != 2:
                        raise self.perror('wrong number of arguments',
                                          skip='length')
                    if len(args[1]) != 1 or not args[1][0][1]:
                        raise self.perror('syntax error (expected an ID)',
                                          skip='length')
                    arg1 = args[0]
                    arg2 = args[1][0][1]
                    self.script.append((SCRIPT_TAG, lexeme, arg1, arg2))
                else:
                    # \qNUMBER[ID][text]
                    arg1 = self.read_number()
                    arg2 = self.read_sbra_id()
                    arg3 = self.read_sbra_text()
                    self.script.append((SCRIPT_TAG, lexeme, arg1, arg2, arg3))
            elif lexeme in ['\\_s']:
                if self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    args = [arg[0][1] for arg in \
                            self.split_params(self.read_sbra_text())]
                    self.script.append((SCRIPT_TAG, lexeme) + tuple(args))
                else:
                    self.script.append((SCRIPT_TAG, lexeme))
            elif lexeme in ['\\_a']:
                if anchor is None:
                    # opening tag: prepare the error that is raised if
                    # the script ends before the closing \_a is seen
                    anchor = self.perror(r'syntax error (unbalanced \_a tag)',
                                         skip='rest')
                    self.script.append(
                        (SCRIPT_TAG, lexeme, self.read_sbra_id()))
                else:
                    anchor = None
                    self.script.append((SCRIPT_TAG, lexeme))
            elif lexeme in ['\\f']:
                args = [arg[0][1] for arg in \
                        self.split_params(self.read_sbra_text())]
                self.script.append((SCRIPT_TAG, lexeme) + tuple(args))
            else:
                raise self.perror('unknown tag (%s)' % lexeme, skip='length')
        # Flush the trailing text before looking at the anchor: the
        # anchor error skips the rest of the source, so text that is not
        # in the script at this point would be lost to a 'loose' caller.
        if string_chunks:
            text.append((TEXT_STRING, ''.join(string_chunks)))
        if text:
            self.script.append((SCRIPT_TEXT, tuple(text)))
        if anchor is not None:
            # close the anchor, keeping a final \e as the last node
            if self.script[-1] == (SCRIPT_TAG, r'\e'):
                self.script.insert(len(self.script) - 1, (SCRIPT_TAG, r'\_a'))
            else:
                self.script.append((SCRIPT_TAG, r'\_a'))
            anchor.script = self.script
            raise anchor
        return self.script

    def read_number(self):
        """Consume a number token and return its lexeme (a string).

        Raises:
            ParserError: if the next token is not a number or the
                script has ended.
        """
        token, number = self.next_token()
        if token != TOKEN_NUMBER:
            raise self.perror('syntax error (expected a number)')
        return number

    def read_sbra_number(self):
        """Consume "[" number "]" and return the number's lexeme.

        Raises:
            ParserError: if the tokens do not have that shape or the
                script has ended.
        """
        token, lexeme = self.next_token()
        if token != TOKEN_OPENED_SBRA:
            raise self.perror('syntax error (expected a square bracket)')
        token, number = self.next_token()
        if token != TOKEN_NUMBER:
            raise self.perror('syntax error (expected a number)',
                              skip='length')
        token, lexeme = self.next_token()
        if token != TOKEN_CLOSED_SBRA:
            raise self.perror('syntax error (expected a square bracket)',
                              skip='length')
        return number

    def read_sbra_id(self):
        """Consume a bracketed argument made of exactly one chunk.

        Returns:
            The chunk's string.  If it can be converted by int(), the
            result is normalized through str(int(...)), so "0010"
            becomes "10".

        Raises:
            ParserError: if the bracketed text does not consist of a
                single chunk, or is malformed.
        """
        text = self.read_sbra_text()
        if len(text) != 1:
            raise self.perror('syntax error (expected a single ID)',
                              skip='length')
        try:
            sbra_id = str(int(text[0][1]))
        except ValueError:
            # not a number: return the ID unchanged
            return text[0][1]
        return sbra_id

    def read_sbra_text(self):
        """Consume "[" ... "]" and return its content as a chunk tuple.

        Numbers, strings, nested "[" and tags inside the brackets are
        joined into TEXT_STRING chunks, with the escapes "\\\\", "\\%"
        and "\\]" undone; meta strings become TEXT_META chunks.

        Raises:
            ParserError: if the next token is not "[", or the script
                ends before the closing "]".
        """
        token, lexeme = self.next_token()
        if token != TOKEN_OPENED_SBRA:
            raise self.perror('syntax error (expected a square bracket)')
        text = []
        string_chunks = []
        while self.tokens:
            token, lexeme = self.next_token()
            if token in [TOKEN_NUMBER, TOKEN_STRING, TOKEN_OPENED_SBRA,
                         TOKEN_TAG]:
                lexeme = lexeme.replace(r'\\', '\\')
                lexeme = lexeme.replace(r'\%', '%')
                lexeme = lexeme.replace(r'\]', ']')
                string_chunks.append(lexeme)
                continue
            if string_chunks:
                text.append((TEXT_STRING, ''.join(string_chunks)))
                string_chunks = []
            if token == TOKEN_CLOSED_SBRA:
                break
            elif token == TOKEN_META:
                text.append((TEXT_META, lexeme))
            else:
                raise self.perror('syntax error (wrong type of argument)',
                                  skip='length')
        else:
            # the tokens ran out without a closing bracket
            raise self.perror('unexpected end of script', position='eol')
        return tuple(text)

    # one parameter: everything up to the next comma outside of quotes
    re_param = re.compile('("[^"]*"|[^,])*')
    # a double-quoted section; group 1 is the text between the quotes
    re_quote = re.compile('"([^"]*)"')

    def split_params(self, text):
        """Split a chunk tuple into comma separated parameters.

        Args:
            text: tuple of chunks as returned by read_sbra_text().

        Returns:
            A list with one tuple of chunks per parameter.  Commas
            inside double quotes do not separate parameters, and the
            quotes themselves are removed.  Only commas found in
            TEXT_STRING chunks separate parameters.
        """
        params = []
        buf = []   # chunks of the parameter being built
        for token, lexeme in text:
            i = 0
            j = len(lexeme)
            if token == TEXT_STRING:
                while i < j:
                    match = self.re_param.match(lexeme, i)
                    if not match:
                        break
                    param = self.re_quote.sub(
                        lambda m: m.group(1), match.group())
                    # an empty piece only counts when it is a whole
                    # parameter of its own
                    if param or not buf:
                        buf.append((token, param))
                    params.append(tuple(buf))
                    buf = []
                    i = match.end()
                    if i < j:
                        # re_param can only stop early at a comma
                        assert lexeme[i] == ','
                        i += 1
            if i < j:
                # chunks other than TEXT_STRING are kept whole
                buf.append((token, lexeme[i:]))
        if buf:
            params.append(tuple(buf))
        return params


# Tests

testcases = [
    # legal cases
    r'\s[4]ちゃんと選んでよう〜っ。\w8\uまあ、ユーザさんも忙しいんやろ‥‥\e',
    r'%selfnameと%keroname\e',
    r'エスケープのテスト \\, \%, [, ], \] どーかな?\e',
    r'\j[http://www.asahi.com]\e',
    r'\j[http://www.asahi.com/[escape\]/\%7Etest]\e',
    r'\j[http://www.asahi.com/%7Etest/]\e',
    r'\h\s[0]%usernameさんは今どんな感じ？\n\n\q0[#temp0][まあまあ]\q1[#temp1][今ひとつ]\z',
    r'\q0[#temp0][今日は%month月%day日だよ]\e',
    r'\q0[#cancel][行かない]\q1[http://www.asahi.com/%7Etest/][行く]\e',
    r'\q[テスト,test]\q[%month月%day日,date]\e',
    r'\q[テスト,http://www.asahi.com/]\e',
    r'\q[テスト,http://www.asahi.com/%7Etest/]\e',
    r'\h\s[0]%j[#temp0]\e',
    r'\URL[http://www.asahi.com/]\e',
    r'\URL[http://www.asahi.com/%7Etest/]\e',
    r'\URL[行かない][http://www.asahi.com/][トップ][http://www.asahi.com/%7Etest/][テスト]\e',
    r'\_s\s5\w44えんいー%c\e',
    r'\h%m?\e',
    r'\URL[http://www.foo.jp/%7Ebar/]',
    r'\b[0]\b[normal]\i[0]\i[eyeblink]',
    r'\c\x\t\_q\*\1\2\4\5\-\+\_+\a\__c\__t\_n',
    r'\_l[0,0]\_v[test.wav]\_V\_c[test]',
    r'\h\s0123\u\s0123\h\s1234\u\s1234',
    r'\s[-1]\b[-1]',
    r'\_u[0x0010]\_m[0x01]\&[Uuml]\&[uuml]',
    r'\n\n[half]\n',
    r'\![open,teachbox]\e',
    r'\![raise,OnUserEvent,"0,100"]\e',
    r'\![raise,"On"User"Event",%username,,"",a"","""","foo,bar"]\e',
    r'\_a[http://www.asahi.com/]Asahi.com\_a\_s\_a[test]foo\_a\e',
    r'\_a[test]%j[http://www.asahi.com]%hour時%minute分%second秒\_a',
    r'\![raise,OnWavePlay,voice\hello.mp3]\e',
    r'\q[Asahi.com,新聞を読む]',
    r'\j[\s4]\e',
    r'\p[2]\s[100]3人目',
    r'\_s[0,2]keroは\_s仲間はずれ\_sです。\e',
    # illegal cases (to be passed)
    r'20%終了 (%hour時%minute分%second秒)',
    r'\g',
    # illegal cases
    r'\j[http://www.asahi',
    r'\s\e',
    r'\j4\e',
    r'\q0[#temp0]\e',
    r'\q[test]\e',
    r'\q[foo,bar,test]\e',
    r'\q[起動時間,%exh時間]\e',
    r'\q[,]\e',
    r'\URL[しんぶーん][http://www.asahi.com/]\e',
    r'\_atest\_a',
    r'\_a[test]',
    r'\s[normal]',
    r'\s[0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001]',
    ]

def test_tokenizer():
    """Print the token list that Parser.tokenize() yields per test case.

    A ParserError raised for a test case is printed instead of the
    token list.
    """
    parser = Parser()
    for test in testcases:
        try:
            # print(x) with one argument prints the same text on
            # Python 2 and Python 3
            print(parser.tokenize(test))
        except ParserError as e:
            print(e)

def test_parser(error='strict'):
    """Parse every test case and print the resulting script tree.

    Args:
        error: error scheme given to the Parser, 'strict' or 'loose'.

    When parsing fails, the error is printed and parsing is resumed
    with the remainder of the source carried by the ParserError; the
    nodes parsed before the error are kept.  With the 'strict' scheme
    that remainder is empty, so each test case stops at its first error.
    """
    parser = Parser(error)
    for test in testcases:
        print('*' * 60)
        print(test)
        script = []
        while 1:
            try:
                script.extend(parser.parse(test))
            except ParserError as e:
                print('-' * 60)
                print(e)
                # a ParserError unpacks into (nodes parsed, rest of source)
                done, test = e
                script.extend(done)
            else:
                break
        print('-' * 60)
        print_script_tree(script)

def print_script_tree(tree):
    """Print a readable dump of a list of nodes returned by Parser.parse().

    Tags are printed as "TAG name" followed by one line per argument;
    an argument that is not a string is treated as a chunk tuple and
    printed through print_text().  Text nodes are printed as "TEXT"
    followed by their chunks.
    """
    for node in tree:
        if node[0] == SCRIPT_TAG:
            name, args = node[1], node[2:]
            print('TAG %s' % name)
            for n, arg in enumerate(args):
                if isinstance(arg, str):
                    print('\tARG#%d\t%s' % (n + 1, arg))
                else:
                    print('\tARG#%d\tTEXT' % (n + 1))
                    print_text(arg, 2)
        elif node[0] == SCRIPT_TEXT:
            print('TEXT')
            print_text(node[1], 1)

def print_text(text, indent):
    """Print the chunks of a text tuple, one per line.

    Args:
        text: iterable of (TEXT_STRING, string) and
            (TEXT_META, name, argument, ...) chunks.
        indent: number of tab characters put in front of each line.
    """
    for chunk in text:
        if chunk[0] == TEXT_STRING:
            print(''.join(('\t' * indent, 'STRING\t"%s"' % chunk[1])))
        elif chunk[0] == TEXT_META:
            name, args = chunk[1], chunk[2:]
            print(''.join(('\t' * indent, 'META\t', name)))
            for n, arg in enumerate(args):
                print(''.join(('\t' * indent,
                               '\tARG#%d\t%s' % (n + 1, arg))))

# Command line entry point:
#   script.py tokenizer              print the tokens of every test case
#   script.py parser strict|loose    parse every test case and print the tree
# Anything else prints a usage line.
if __name__ == '__main__':
    import os
    if len(sys.argv) == 2 and sys.argv[1] == 'tokenizer':
        test_tokenizer()
    elif len(sys.argv) == 3 and sys.argv[1] == 'parser':
        test_parser(sys.argv[2])
    else:
        # one pre-formatted string, so the output is the same on
        # Python 2 and Python 3
        print('Usage: %s [tokenizer|parser [strict|loose]]'
              % os.path.basename(sys.argv[0]))

# Syntax of the Sakura Script (partial: the parser above also accepts
# tags that are not listed here, e.g. \p, \f, \_l and \_v):
#   "\e"
#   "\h"
#   "\u"
#   "\s" OpenedSbra Number ClosedSbra
#   "\b" OpenedSbra Number ClosedSbra
#   "\n" (OpenedSbra Text ClosedSbra)?
#   "\w" Number
#   "\_w" OpenedSbra Number ClosedSbra
#   "\j" OpenedSbra ID ClosedSbra
#   "\c"
#   "\x"
#   "\t"
#   "\_q"
#   "\_s" (OpenedSbra Text ClosedSbra)?
#   "\_n"
#   "\q" Number OpenedSbra Text ClosedSbra OpenedSbra Text ClosedSbra
#   "\q" OpenedSbra Text "," ID ClosedSbra
#   "\z"
#   "\y"
#   "\*"
#   "\v"
#   "\8" OpenedSbra ID ClosedSbra
#   "\m" OpenedSbra ID ClosedSbra
#   "\i" OpenedSbra ID ClosedSbra
#   "\_e"
#   "\a"
#   "\!" OpenedSbra Text ClosedSbra
#   "\_c" OpenedSbra Text ClosedSbra
#   "\__c"
#   "\URL" OpenedSbra Text ClosedSbra [ OpenedSbra Text ClosedSbra OpenedSbra Text ClosedSbra ]*
#   "\&" OpenedSbra ID ClosedSbra
#   "\_u" OpenedSbra ID ClosedSbra
#   "\_m" OpenedSbra ID ClosedSbra
#   "\_a" OpenedSbra ID ClosedSbra Text "\_a"
