module Hailo::Parse

Direct including types

Defined in: hailo/parse.cr

Constant Summary

ABBREV = /#{ALPHABET}(?:\.#{ALPHABET})++\./
ADDRESS = /:/
ALPHABET = /(?:[\p{L}\p{M}])/
APOST_WORD = /#{ALPHABET}++(?:#{APOSTROPHE}#{ALPHABET}++)++/
APOSTROPHE = /['’´]/
BARE_WORD = /#{WORD_CHAR}++/
BOUNDARY = /#{CLOSE_QUOTE}?(?:\s*#{TERMINATOR}|#{ADDRESS})\s+#{OPEN_QUOTE}?\s*/
CLOSE_QUOTE = /['"’“”«»」』›‘]/
CLOSE_TAG = /<\/(?:-|#{WORD_CHAR})+>/
CURRENCY = /[¤¥¢£\$]/
DASH = /[–-]/
DATE = /[0-9]{4}-[Ww]?[0-9]{1,2}-[0-9]{1,2}/
DATETIME = /#{DATE}[Tt]#{TIME}/
DOTTED = /#{BARE_WORD}?\.#{BARE_WORD}(?:\.#{BARE_WORD})*+/
DOTTED_STRICT = /#{LOOSE_WORD}(?:#{POINT}(?:\d+|#{WORD_CHAR}{2,}))?/
ELLIPSIS = /(?:\.{2,}|…)/
EMAIL = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+(?:\.[A-Za-z]{2,4})*/
ESC_SPACE = /(?:\\ )+/
FILENAME = /(?:#{NAME})?\.#{NAME}(?:\.#{NAME})*|#{NAME}/
HOST_WORD = /#{BARE_WORD}(?:-+#{BARE_WORD})*/
HOSTNAME = /#{HOST_WORD}(?:\.#{HOST_WORD})*/
IPV4 = /[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/
IPV6 = /(?:(?:[0-9A-Fa-f]{0,4})?:){1,7}[0-9A-Fa-f]{0,4}/
IRC_CHAN = /[#&+][^ \a\0\012\015,:]{1,199}/
IRC_NICK = /<(?: |[&~]?[@%+~&])?[A-Za-z_`\-^\|\\\{\}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/
LOOSE_WORD = /#{IRC_CHAN}|#{DATETIME}|#{DATE}|#{TIME}|#{PATH}|#{NUMBER}|#{ABBREV}|#{APOST_WORD}|#{NUMERO}|#{BARE_WORD}(?:#{DASH}(?:#{WORD_TYPES}|#{BARE_WORD})|#{APOSTROPHE}(?!#{ALPHABET}|#{NUMBER}|#{APOSTROPHE})|#{DASH}(?!#{DASH}{2}))*/
NAME = /(?:#{BARE_WORD}|#{ESC_SPACE})+/
NON_WORD = /#{NWORD_CHAR}++/
NONSPACE = /\S/
NUMBER = /#{CURRENCY}?+#{POINT}\d++(?:#{POINT}\d++)*+(?:#{CURRENCY}|#{ALPHABET}++)?+|#{CURRENCY}?+\d++(?:#{POINT}\d++)*+(?:#{CURRENCY}|#{ALPHABET}++)?+(?!\d|#{ALPHABET})/
NUMERO = /#[0-9]+/
NWORD_CHAR = /[^\s\p{L}\p{M}\d_]/
OPEN_QUOTE = /['"‘“„«»「『‹‚]/
OPT_LONG = /--#{OPT_PART}(?:-#{OPT_PART})*/
OPT_PART = /(?:#{ALPHABET}|\d)(?:#{ALPHABET}|[_\d])+/
OPT_SHORT = /-(?:#{ALPHABET}|\d)(?!#{ALPHABET}|\d)/
PAREN_TIME = /\(#{TIME}\)/
PATH = /#{UNIX_PATH}|#{WIN_PATH}/
PERL_CLASS = /(?:::\w+(?:::\w+)*|\w+(?:::\w+)+)(?:::)?|\w+::/
POINT = /[.,]/
PORT = /:[0-9]+/
PUNCTUATION = /[?!‽,;.:]/
RX_APOST = {"'" => /[’´](?!#{ALPHABET}|#{NUMBER})/, "’" => /['´](?!#{ALPHABET}|#{NUMBER})/, "´" => /['’](?!#{ALPHABET}|#{NUMBER})/}
RX_APOSTROPHE = /#{APOSTROPHE}/
RX_CAPITALIZE_FIRST = /^\s*#{OPEN_QUOTE}?\s*\K#{SPLIT_WORD}(?=#{ELLIPSIS}|(?:(?:#{CLOSE_QUOTE}|#{TERMINATOR}|#{ADDRESS}|#{PUNCTUATION}+)?(?:\s|$)))/
RX_CAPITALIZE_IM = /(?:(?:#{ELLIPSIS}|\s+)|#{OPEN_QUOTE})\Ki(?=#{APOSTROPHE}#{ALPHABET}|\s|#{PUNCTUATION}|$)/
RX_CAPITALIZE_REST_A = /(?:#{ELLIPSIS}|\s+)#{OPEN_QUOTE}?\s*#{WORD_STRICT}#{BOUNDARY}\K#{SPLIT_WORD}/
RX_CAPITALIZE_REST_B = /#{SEPARATOR}#{WORD_STRICT}#{SEPARATOR}#{BOUNDARY}\K#{SPLIT_WORD}/
RX_CAPITALIZE_SECOND = /^#{SPLIT_WORD}(?:\s*#{TERMINATOR}|#{ADDRESS})\s+\K#{SPLIT_WORD}/
RX_DASH_NEWL = /(#{DASH})\s*\n+\s*/
RX_END_PARAGRAPH = /(?:#{ELLIPSIS}|\s+|^)#{OPEN_QUOTE}?(?:#{SPLIT_WORD}(?:\.#{SPLIT_WORD})*)\K(#{CLOSE_QUOTE}?)$/
RX_MIXED_CASE = /\p{Ll}+\p{Lu}|\p{Lu}{2,}\p{Ll}|(?:\p{Lu}+#{NWORD_CHAR}+)(?<!I')(?:\p{Lu}*\p{Ll})/
RX_NEWLINE = /\s*\n+\s*/
RX_NON_SPACE = /\S+/
RX_SPACE = /\s+/
RX_TOKEN_NORMAL = /(?P<word>#{WORD})|(?P<non_word>#{NON_WORD})/
RX_TOKEN_SPECIAL = /(?P<special>#{SPECIAL_WORD})|#{RX_TOKEN_NORMAL}/
RX_WORD_APOST = /#{APOSTROPHE}(?!#{ALPHABET}|#{NUMBER})/
SEPARATOR = "\b"
SPACE = /\s/
SPECIAL_WORD = /(?>#{URI}|#{OPT_SHORT}|#{OPT_LONG}|#{CLOSE_TAG}|#{IRC_NICK}|#{IRC_CHAN}|#{DATETIME}|#{DATE}|#{TIME}|#{PAREN_TIME}|#{SQUARE_TIME}|#{PERL_CLASS}|#{EMAIL}|#{TWAT_NAME}|#{PATH}|#{NUMERO})/
SPLIT_WORD = /#{LOOSE_WORD}(?:\/#{LOOSE_WORD})?(?=#{PUNCTUATION}(?:\s+|$)|#{CLOSE_QUOTE}|#{TERMINATOR}|\s+|$)/
SQUARE_TIME = /\[#{TIME}\]/
TERMINATOR = /[?!‽]+|(?<!\.)\./
TIME = /[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:[Zz]| ?(?:am|AM|pm|PM)|[-+±][0-9]{2}(?::?[0-9]{2})?)?/
TWAT_NAME = /@[A-Za-z0-9_]+/
UNIX_PATH = /\/#{FILENAME}(?:\/#{FILENAME})*\/?/
URI = /#{URI_SCHEME}(?:#{HOSTNAME}|#{IPV4}|#{IPV6})#{PORT}?#{URI_PATH}?/
URI_EXTRA1 = /[-\d%_.~$!#&'()*+,\/:;=?@\[\]]/
URI_EXTRA2 = /[-\d%_~$!#&'()*+,\/:;=?@\[\]]/
URI_PATH = /\/(?:(?:#{ALPHABET}+|#{URI_EXTRA1}+)*(?:#{ALPHABET}+|#{URI_EXTRA2}+))?/
URI_SCHEME = /(?:#{HOST_WORD}\+)?#{BARE_WORD}:\/\//
WIN_PATH = /#{ALPHABET}:\\#{FILENAME}(?:\\#{FILENAME})*\\?/
WORD = /#{WORD_TYPES}(?:(?:#{DASH}#{WORD_TYPES})++|#{DASH}(?!#{DASH}))?+/
WORD_CHAR = /#{ALPHABET}|[\d_]/
WORD_STRICT = /#{DOTTED_STRICT}(?:#{APOSTROPHE}#{DOTTED_STRICT})*/
WORD_TYPES = /#{NUMBER}|#{ABBREV}|#{DOTTED}|#{APOST_WORD}|#{BARE_WORD}/