# Source code for qddate.qdparser

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Ivan Begtin (ivan@begtin.tech)"
__license__ = "BSD"


from .patterns import ALL_PATTERNS, BASE_TIME_PATTERNS
from pyparsing import Optional, lineStart, oneOf, Literal, restOfLine, ParseException
import datetime
from .dirty import matchPrefix


class DateParser:
    """Parse dates and datetimes from text using pyparsing-based patterns.

    Each pattern is a dict holding at least ``key``, ``pattern`` (a pyparsing
    expression) and ``length`` (``{"min": ..., "max": ...}`` bounds on the
    length of the text the pattern may be tried against).
    """

    def __init__(self, generate=True, patterns=None, base_only=False):
        """Initialise the parser.

        :param generate: Boolean value, if true, automatically generate all
            patterns (date, date + time, date + trailing text) from the base
            list ``patterns``
        :param patterns: list of patterns to be used. Defaults to
            ``ALL_PATTERNS`` when omitted or ``None``. See qddate.patterns
            for more info
        :param base_only: Use only base patterns during generation of the
            final list. Skips all patterns that allow text after the datetime.
        """
        # Resolve the default at call time instead of binding the shared
        # module-level list as a default argument value.
        self.patterns = ALL_PATTERNS if patterns is None else patterns
        if generate:
            self.__generate(base_only)
        self.cachedpats = None
        self.ind = []

    def __matchPrefix(self, text):
        """
        This is silver bullet, cornerstone and magic wand of speed of this algorithm
        it filters patterns using manually selected rules. Yes, yes, it's "dirty" code and it could be beautified in
        many ways but this library intended to be fast, not beautiful. Without matching is x1.3 slower so let it be.
        :param text: text with date to match
        :return: list of base pattern keys to run against
        """
        return matchPrefix(text)

    def startSession(self, cached_p):
        """Restrict matching to the patterns whose ``key`` is in ``cached_p``.

        The restriction stays in effect for every :meth:`match` / :meth:`parse`
        call until :meth:`endSession` is called. Keys are compared against the
        current pattern list, i.e. against generated keys (with suffixes such
        as ``:time_1`` or ``:t_right``) when patterns were generated.

        :param cached_p: container of pattern keys to keep
        """
        self.cachedpats = [x for x in self.patterns if x["key"] in cached_p]

    def endSession(self):
        """Drop the restriction set by :meth:`startSession`."""
        self.cachedpats = None

    def __generate(self, base_only=False):
        """Generates dates patterns.

        For every source pattern four entries are produced, in this order:
        three "date followed by time" variants (``:time_1``, ``:time_2``,
        ``:time_3``) and the plain date pattern. Unless ``base_only`` is set,
        every produced entry also gets a ``:t_right`` twin that tolerates
        arbitrary text after the date. The result replaces ``self.patterns``.
        """
        base = []
        for pat in self.patterns:
            # (key suffix, parser elements appended after the date,
            #  time format, extra min length, extra max length)
            variants = (
                (
                    ":time_1",
                    (
                        Optional(Literal(",")).suppress(),
                        BASE_TIME_PATTERNS["pat:time:minutes"],
                    ),
                    "%H:%M",
                    5,
                    8,
                ),
                (
                    ":time_2",
                    (
                        Optional(oneOf([",", "|", "T"])).suppress(),
                        BASE_TIME_PATTERNS["pat:time:full"],
                    ),
                    "%H:%M:%S",
                    9,
                    9,
                ),
                (
                    ":time_3",
                    (
                        Optional(Literal("[")).suppress(),
                        BASE_TIME_PATTERNS["pat:time:minutes"],
                        Optional(Literal("]")).suppress(),
                    ),
                    "%H:%M",
                    7,
                    10,
                ),
            )
            for suffix, tail, time_format, extra_min, extra_max in variants:
                data = pat.copy()
                data["basekey"] = data["key"]
                data["key"] += suffix
                data["right"] = True
                expr = data["pattern"]
                for element in tail:
                    expr = expr + element
                data["pattern"] = expr
                data["time_format"] = time_format
                data["length"] = {
                    "min": data["length"]["min"] + extra_min,
                    "max": data["length"]["max"] + extra_max,
                }
                base.append(data)

            # The plain date pattern itself, tagged like the generated ones.
            data = pat.copy()
            data["right"] = True
            data["basekey"] = data["key"]
            base.append(data)

        if not base_only:
            texted = []
            for pat in base:
                # Same pattern anchored at line start, followed by an optional
                # delimiter and any trailing text.
                data = pat.copy()
                data["key"] += ":t_right"
                data["pattern"] = (
                    lineStart
                    + data["pattern"]
                    + Optional(oneOf([",", "|", ":", ")"])).suppress()
                    + restOfLine.suppress()
                )
                data["length"] = {
                    "min": data["length"]["min"] + 1,
                    "max": data["length"]["max"] + 90,
                }
                texted.append(data)
            base.extend(texted)
        self.patterns = base

    def match(self, text, noprefix=False, noyear=True):
        """Matches date/datetime string against date patterns and returns pattern and parsed date if matched.
        It's not intended for common usage, since if successful it returns the raw parse result and the pattern
        that matched this date

        :param text:
            Any human readable string
        :type text: str|unicode
        :param noprefix:
            If set True then doesn't use prefix based date patterns filtering
        :type noprefix: bool
        :param noyear:
            If set True then patterns with the ``noyear`` flag are used (they produce a lot of false positives),
            if set False such patterns are skipped
        :type noyear: bool

        :return: Returns dict with `values` holding the parse result and `pattern` with info about the matched pattern if successful, else returns None
        :rtype: :class:`dict`."""
        n = len(text)
        pats = self.patterns if self.cachedpats is None else self.cachedpats
        if n > 5 and not noprefix:
            basekeys = self.__matchPrefix(text[:6])
        else:
            basekeys = []
        for p in pats:
            length = p["length"]
            if n < length["min"] or n > length["max"]:
                continue
            # "right"/"basekey" are only added by pattern generation; use
            # .get() so patterns that lack them are simply not prefix-filtered.
            if basekeys and p.get("right") and p.get("basekey") not in basekeys:
                continue
            if not noyear and p.get("noyear"):
                continue
            try:
                r = p["pattern"].parseString(text)
            except ParseException:
                continue
            # Sanity check: reject out-of-range month/day numbers and try the
            # next pattern instead.
            d = r.asDict()
            if "month" in d and not 1 <= int(d["month"]) <= 12:
                continue
            if "day" in d and not 1 <= int(d["day"]) <= 31:
                continue
            return {"values": r, "pattern": p}
        return None

    def parse(self, text, noprefix=False):
        """Parse date and time from given date string.

        :param text:
            Any human readable string
        :type text: str|unicode
        :param noprefix:
            If set True then doesn't use prefix based date patterns filtering
        :type noprefix: bool

        :return: Returns :class:`datetime <datetime.datetime>` representing parsed date if successful, else returns None.
            None is also returned when a pattern matched but the extracted numbers do not form a valid calendar date
        :rtype: :class:`datetime <datetime.datetime>`."""
        res = self.match(text, noprefix)
        if not res:
            return None
        r = res["values"]
        p = res["pattern"]
        d = {"month": 0, "day": 0, "year": 0}
        if p.get("noyear"):
            # Patterns without a year component default to the current year.
            d["year"] = datetime.datetime.now().year
        for k, v in r.items():
            d[k] = int(v)
        try:
            return datetime.datetime(**d)
        except ValueError:
            # e.g. 31 February: passes the coarse range check in match() but
            # is not a real date, so report "not parsed" as documented.
            return None


# Manual smoke test: run every sample string through DateParser.match and
# build a datetime from whatever matched.
if __name__ == "__main__":
    from pprint import pprint

    tests = [
        "01.12.2009",
        "2013-01-12",
        "31.05.2001",
        "7/12/2009",
        "6 Jan 2009",
        "Jan 8, 1098",
        "JAN 1, 2001",
        "3 Января 2003 года",
        "05 Января 2003",
        "12.03.1999 Hello people",
        "15 февраля 2007 года",
        "5 August 2001",
        "3 jun 2009",
        "16 May 2009 14:10",
        "01 february 2009",
        "01.03.2009 14:53",
        "01.03.2009 14:53:12",
        "22.12.2009 17:56",
        "05/16/99",
        "11/29/1991",
        "Thursday 4 April 2019",
        "July 01, 2015",
        "Fri, 3 July 2015",
        u"2 Июня 2015",
        u"9 июля 2015 г.",
        u"26 / 06 ‘15",
        u"09.июля.2015",
        u"14th April 2015:",
        u"23 Jul 2015, 09:00 BST",
        u"пятница, июля 17, 2015",
        u"Июль 16, 2015",
        u"Le 8 juillet 2015",
        u"8 juillet 2015",
        u"Fri 24 Jul 2015",
        u"26 de julho de 2015",
        u"17 de Junio de 2015",
        u"28. Juli 2015",
        u"21 Фeвpyapи 2015",
        u"1 нoeмвpи 2013",
        u"23 июня 2015",
        u"3 Июля, 2015",
        u"7 August, 2015",
        u"Wednesday 22 Apr 2015",
        u"12-08-2015 - 09:00",
        u"08 Jul, 2015",
        u"August 10th, 2015",
        u"junio 9, 2015",
        u"Авг 11, 2015",
        u"Вторник, 18 Август 2015 18:51",
        u"Июль 16th, 2012 | 11:08 пп",
        u"19 август в 16:03",
        u"7 August, 2015",
        u"9 Июля 2015 [11:23]",
    ]

    ind = DateParser(generate=True)
    print(len(ind.patterns))
    for text in tests:
        res = ind.match(text)
        # The original printed/tested `r` here, which is unbound on the first
        # iteration; the match result `res` is what was meant.
        print(res)
        if res:
            r = res["values"]
            p = res["pattern"]
            d = {"month": 0, "day": 0, "year": 0}
            if "noyear" in p and p["noyear"] == True:
                d["year"] = datetime.datetime.now().year
            for k, v in list(r.items()):
                d[k] = int(v)
            dt = datetime.datetime(**d)
        else:
            pass

    # `dateparser` is an optional third-party package kept only for manual
    # side-by-side comparison; its absence must not crash the demo.
    try:
        import dateparser
    except ImportError:
        dateparser = None

    for text in tests:
        pass
        # print(dateparser.parse(text))
#    ind.patterns = DATE_DATA_TYPES_RAW