"""
Test the DataShape lexer.
"""

from __future__ import absolute_import, division, print_function

import unittest

import datashape
from datashape import lexer


class TestDataShapeLexer(unittest.TestCase):

    def check_isolated_token(self, ds_str, tname, val=None):
        # The token name should be an attribute of the lexer module
        tid = getattr(lexer, tname)
        # Lexing should produce a single token matching the specification
        self.assertEqual(list(lexer.lex(ds_str)),
                         [lexer.Token(tid, tname, (0, len(ds_str)), val)])

    def check_failing_token(self, ds_str):
        # Consuming the token stream should fail, because the error
        # is in the first token.
        self.assertRaises(datashape.DataShapeSyntaxError, list, lexer.lex(ds_str))

    def test_isolated_tokens(self):
        self.check_isolated_token('testing', 'NAME_LOWER', 'testing')
        self.check_isolated_token('Testing', 'NAME_UPPER', 'Testing')
        self.check_isolated_token('_testing', 'NAME_OTHER', '_testing')
        self.check_isolated_token('*', 'ASTERISK')
        self.check_isolated_token(',', 'COMMA')
        self.check_isolated_token('=', 'EQUAL')
        self.check_isolated_token(':', 'COLON')
        self.check_isolated_token('[', 'LBRACKET')
        self.check_isolated_token(']', 'RBRACKET')
        self.check_isolated_token('{', 'LBRACE')
        self.check_isolated_token('}', 'RBRACE')
        self.check_isolated_token('(', 'LPAREN')
        self.check_isolated_token(')', 'RPAREN')
        self.check_isolated_token('...', 'ELLIPSIS')
        self.check_isolated_token('->', 'RARROW')
        self.check_isolated_token('?', 'QUESTIONMARK')
        self.check_isolated_token('32102', 'INTEGER', 32102)
        self.check_isolated_token('"testing"', 'STRING', 'testing')
        self.check_isolated_token("'testing'", 'STRING', 'testing')
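
    def test_token_spans(self):
        # A hedged sketch, assuming spans behave as in
        # check_isolated_token above: each token's span is taken to be
        # the (start, stop) offsets into the input string, skipping
        # over the whitespace between tokens.
        self.assertEqual(list(lexer.lex('* ,')),
                         [lexer.Token(lexer.ASTERISK, 'ASTERISK', (0, 1), None),
                          lexer.Token(lexer.COMMA, 'COMMA', (2, 3), None)])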

    def test_integer(self):
        # Digits
        self.check_isolated_token('0', 'INTEGER', 0)
        self.check_isolated_token('1', 'INTEGER', 1)
        self.check_isolated_token('2', 'INTEGER', 2)
        self.check_isolated_token('3', 'INTEGER', 3)
        self.check_isolated_token('4', 'INTEGER', 4)
        self.check_isolated_token('5', 'INTEGER', 5)
        self.check_isolated_token('6', 'INTEGER', 6)
        self.check_isolated_token('7', 'INTEGER', 7)
        self.check_isolated_token('8', 'INTEGER', 8)
        self.check_isolated_token('9', 'INTEGER', 9)
        # Various-sized numbers
        self.check_isolated_token('10', 'INTEGER', 10)
        self.check_isolated_token('102', 'INTEGER', 102)
        self.check_isolated_token('1024', 'INTEGER', 1024)
        self.check_isolated_token('10246', 'INTEGER', 10246)
        self.check_isolated_token('102468', 'INTEGER', 102468)
        self.check_isolated_token('1024683', 'INTEGER', 1024683)
        self.check_isolated_token('10246835', 'INTEGER', 10246835)
        self.check_isolated_token('102468357', 'INTEGER', 102468357)
        self.check_isolated_token('1024683579', 'INTEGER', 1024683579)
        # Leading zeros are not allowed
        self.check_failing_token('00')
        self.check_failing_token('01')
        self.check_failing_token('090')
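
    def test_integer_in_context(self):
        # A hedged sketch combining the whitespace handling verified in
        # test_whitespace below with the isolated tokens above: an
        # integer adjacent to other tokens should lex independently.
        toks = list(lexer.lex('3 * var'))
        self.assertEqual([(tok.id, tok.val) for tok in toks],
                         [(lexer.INTEGER, 3),
                          (lexer.ASTERISK, None),
                          (lexer.NAME_LOWER, 'var')])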

    def test_string(self):
        # Trivial strings
        self.check_isolated_token('""', 'STRING', '')
        self.check_isolated_token("''", 'STRING', '')
        self.check_isolated_token('"test"', 'STRING', 'test')
        self.check_isolated_token("'test'", 'STRING', 'test')
        # Valid escaped characters
        self.check_isolated_token(r'"\"\b\f\n\r\t\ub155"', 'STRING',
                                  u'"\b\f\n\r\t\ub155')
        self.check_isolated_token(r"'\'\b\f\n\r\t\ub155'", 'STRING',
                                  u"'\b\f\n\r\t\ub155")
        # A sampling of invalid escaped characters
        self.check_failing_token(r'''"\'"''')
        self.check_failing_token(r"""'\"'""")
        self.check_failing_token(r"'\a'")
        self.check_failing_token(r"'\s'")
        self.check_failing_token(r"'\R'")
        self.check_failing_token(r"'\N'")
        self.check_failing_token(r"'\U'")
        self.check_failing_token(r"'\u123g'")
        self.check_failing_token(r"'\u123'")
        # Some unescaped and escaped unicode characters
        self.check_isolated_token(u'"\uc548\ub155 \\uc548\\ub155"', 'STRING',
                                  u'\uc548\ub155 \uc548\ub155')
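
    def test_string_sequence(self):
        # A hedged sketch: two quoted strings separated by a space are
        # assumed to lex as two independent STRING tokens rather than
        # being merged into one.
        toks = list(lexer.lex('"a" \'b\''))
        self.assertEqual([(tok.id, tok.val) for tok in toks],
                         [(lexer.STRING, 'a'), (lexer.STRING, 'b')])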

    def test_failing_tokens(self):
        self.check_failing_token('~')
        self.check_failing_token('`')
        self.check_failing_token('@')
        self.check_failing_token('$')
        self.check_failing_token('%')
        self.check_failing_token('^')
        self.check_failing_token('&')
        self.check_failing_token('-')
        self.check_failing_token('+')
        self.check_failing_token(';')
        self.check_failing_token('<')
        self.check_failing_token('>')
        self.check_failing_token('.')
        self.check_failing_token('..')
        self.check_failing_token('/')
        self.check_failing_token('|')
        self.check_failing_token('\\')
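
    def test_failing_token_after_valid(self):
        # A hedged sketch: lexing is assumed to be lazy (hence the
        # list() calls throughout), so an invalid character after
        # valid tokens should still raise once the stream is consumed.
        self.assertRaises(datashape.DataShapeSyntaxError,
                          list, lexer.lex('testing ~'))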

    def test_whitespace(self):
        expected_idval = [(lexer.COLON, None),
                          (lexer.STRING, 'a'),
                          (lexer.INTEGER, 12345),
                          (lexer.RARROW, None),
                          (lexer.EQUAL, None),
                          (lexer.ASTERISK, None),
                          (lexer.NAME_OTHER, '_b')]
        # With minimal whitespace
        toks = list(lexer.lex(':"a"12345->=*_b'))
        self.assertEqual([(tok.id, tok.val) for tok in toks], expected_idval)
        # With spaces
        toks = list(lexer.lex(' : "a" 12345 -> = * _b '))
        self.assertEqual([(tok.id, tok.val) for tok in toks], expected_idval)
        # With tabs
        toks = list(lexer.lex('\t:\t"a"\t12345\t->\t=\t*\t_b\t'))
        self.assertEqual([(tok.id, tok.val) for tok in toks], expected_idval)
        # With newlines
        toks = list(lexer.lex('\n:\n"a"\n12345\n->\n=\n*\n_b\n'))
        self.assertEqual([(tok.id, tok.val) for tok in toks], expected_idval)
        # With spaces, tabs, newlines and comments
        toks = list(lexer.lex('# comment\n'
                              ': # X\n'
                              ' "a" # "b"\t\n'
                              '\t12345\n\n'
                              '->\n'
                              '=\n'
                              '*\n'
                              '_b # comment\n'
                              ' \t # end'))
        self.assertEqual([(tok.id, tok.val) for tok in toks], expected_idval)
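

# Standard unittest entry point, so the file can be run directly.
if __name__ == '__main__':
    unittest.main()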
