re
module.
r"\n"
instead of
"\\n".
The
Backslash Plague.
#Compare escape sequences in ordinary strings with raw strings (r"...").
print("\thello") #one tab character
print("\\thello") #a backslash and a lowercase t
print("The \\ is a backslash.") #ordinary string: \\ is the escape sequence for one backslash
print(r"The \ is a backslash.") #raw string: the backslash stands for itself
print("The \\\\ is two backslashes.") #ordinary string: each \\ pair yields one backslash
print(r"The \\ is two backslashes.") #raw string: both backslashes are kept as written
	hello
\thello
The \ is a backslash.
The \ is a backslash.
The \\ is two backslashes.
The \\ is two backslashes.
import sys
import re #regular expressions
filename = "infile.txt" #e.g., macOS "/usr/share/dict/words"

#Open the file; exit with status 1 if it is missing or unreadable.
try:
    lines = open(filename)
except FileNotFoundError:
    print(f"Sorry, could not find file \"{filename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open file \"{filename}\".")
    sys.exit(1)

#Print every line that contains the pattern.
#The with statement closes the file even if the loop raises an exception.
with lines:
    for line in lines:
        line = line.rstrip("\n") #Remove the trailing newline.
        if re.search("hello, there", line): #or if not
            print(line)

sys.exit(0)
You can combine multiple flags with the bitwise | operator.
if re.search("hello, there", line, flags = re.IGNORECASE):
if re.search("^anti", line, flags = re.IGNORECASE):
if re.search("phobia$", line, flags = re.IGNORECASE):
if re.search("^$", line): #Search for empty lines.
if re.search("\\$100", line): #Search for one hundred dollars.
if re.search(r"\$100", line): #Search for one hundred dollars.
if re.search("\\^", line): #Search for a caret.
if re.search(r"\^", line): #Search for a caret.
if re.search(r"^\^", line): #Search for lines that begin with a caret.
if re.search("sep.rate", line, flags = re.IGNORECASE): #Search for separate, seperate, etc.
if re.search("^...u.$", line, flags = re.IGNORECASE): #with the anchors
if re.search("...u.", line, flags = re.IGNORECASE): #without the anchors
if re.search("^b.g$", line, flags = re.IGNORECASE):
if re.search("^p.t$", line, flags = re.IGNORECASE):
if re.search("^.....$", line): #all lines of exactly 5 characters, no more and no less
if re.search(".....", line): #all lines of 5 or more characters
if re.search("^.{5}$", line): #all lines of exactly 5 characters, no more and no less
if re.search("...ism$", line, flags = re.IGNORECASE): #ideological movement
if re.search(".", line): #lines containing a character (or maybe more)
if re.search("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", line): #lines containing an uppercase letter
if re.search("[A-Z]", line): #no space around dash, can't say [Z-A]
if re.search("^[A-K]", line, flags = re.IGNORECASE): #lines starting with letter in 1st half of alphabet
if re.search("^[L-Z]", line, flags = re.IGNORECASE): #lines starting with letter in 2nd half of alphabet
if re.search("[a-z]", line): #any lowercase letter
if re.search("[0-9]", line): #any decimal digit
if re.search("19[0-9][0-9]", line): #any year in the 1900's
if re.search("[CcTt][sz]ar", line): #in search of Russia's imperial past
if re.search("[0-7]", line): #any octal digit
if re.search("[0-9A-Fa-f]", line): #any hexadecimal digit
if re.search("[0-9][0-9][0-9][0-9][0-9]", line): #any zip code
See also
\s,
etc.
if re.search(r"^\d\d\d\d\d$", line): #any zip code
if re.search(r"^\d{5}$", line): #any zip code
"^[A-Z][0-9][A-Z] [0-9][A-Z][0-9]$" #Canadian postal code, e.g. A2B 3C4
Can’t use the letters D, F, I, O, Q or U; first letter can’t be W or Z.
"^[A-CEGHJ-NPR-TVXY][0-9][A-CEGHJ-NPR-TV-Z] [0-9][A-CEGHJ-NPR-TV-Z][0-9]$"
r"^[A-CEGHJ-NPR-TVXY]\d[A-CEGHJ-NPR-TV-Z] \d[A-CEGHJ-NPR-TV-Z]\d$"
"^[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]$" #social security number
r"^\d\d\d-\d\d-\d\d\d\d$"
r"^\d{3}-\d{2}-\d{4}$"
1-800-737-3783 spells “reserve”.
"^[pqrs][def][pqrs][def][pqrs][tuv][def]$" "[pqrs][def][pqrs][def][pqrs][tuv][def]"
cal -h 6 2019
June 2019
Su Mo Tu We Th Fr Sa
1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30
cal -h 6 2019 | tr ' ' .
.....June.2019........
Su.Mo.Tu.We.Th.Fr.Sa..
...................1..
.2..3..4..5..6..7..8..
.9.10.11.12.13.14.15..
16.17.18.19.20.21.22..
23.24.25.26.27.28.29..
30....................
cal -h 6 2019 | tail -n +3 | tr ' ' .
...................1..
.2..3..4..5..6..7..8..
.9.10.11.12.13.14.15..
16.17.18.19.20.21.22..
23.24.25.26.27.28.29..
30....................
import re #regular expression
import os #operating system
#Count the Thursdays in June 2019 by reading the output of cal.
#tail -n +3 drops the two header lines (month name and weekday names),
#leaving only the rows of day numbers.
command = "cal -h 6 2019 | tail -n +3"
count = 0

#The with statement closes the pipe even if the loop raises an exception.
with os.popen(command) as lines: #pipe open
    for line in lines:
        line = line.rstrip("\n") #Remove the trailing newline.
        #In cal's layout the Thursday column ends at the 14th character,
        #so a digit there means this week has a Thursday.
        if re.search(r"^.{13}\d", line): #or if re.search(r"\d.{8}$", line):
            count += 1

print(f"June 2019 countains {count} Thursdays.")
June 2019 countains 4 Thursdays.
"[A-Z]" #any uppercase letter
#any character except an uppercase letter
r"[] !\"#$%&'()*+,./0123456789:;<=>?@[\^_`abcdefghijklmnopqrstuvwxyz{|}~-]"
"[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]"
"[^A-Z]"
"[^A]" #any character except uppercase A
Does /usr/share/dict/words have any line with q not followed by u?
if re.search("q[^u]", line, flags = re.IGNORECASE):
if re.search("q$", line, flags = re.IGNORECASE):
if re.search("q[^u]|q$", line, flags = re.IGNORECASE): #The vertical bar means "or".
if re.search("q([^u]|$)", line, flags = re.IGNORECASE):
if re.search("[^A]", line, flags = re.IGNORECASE):
if not re.search("A", line, flags = re.IGNORECASE):
"100"
"[^0-9]100[^0-9]"
r"\D100\D"
r"^100\D"
r"\D100$"
"^100$"
r"(^|\D)100(\D|$)"
"max"
"[^A-Za-z0-9_]max[^A-Za-z0-9_]"
r"\Wmax\W"
r"^max\W"
r"\Wmax$"
"^max$"
r"(^|\W)max(\W|$)"
"[ -~]" #any printable ASCII character except tab "[ -~\t]" #any printable ASCII character "[^ -~\t]" #any nonprintable ASCII character
"[A-C]" #search for any of the three characters A, B, C "[AC-]" #search for any of the three characters A, C, dash
"[^BC]" #search for any character except B or C "[B^C]" #search for any of the three characters B, caret, C
"[BC]]" #search for B or C, followed by ] "[]BC]" #search for any of the three characters right bracket, B, C
"[]^-]" #search for any of the three characters ], ^, -
"^Manhattan" "^ Manhattan" "^  Manhattan" "^   Manhattan" "^ *Manhattan"
"21210040" #area code, zip code "212.10040" "212..10040" "212...10040" "212.*10040"
if re.search("^anti.*ism$", line, flags = re.IGNORECASE): #in /usr/share/dict/words
if re.search("^[^aeiou]*a[^aeiou]*e[^aeiou]*i[^aeiou]*o[^aeiou]*u[^aeiou]*$", line, flags = re.IGNORECASE): #in /usr/share/dict/words
Two ways to do the same thing. The second one is better because it’s simpler.
if re.search("^anti.*", line, flags = re.IGNORECASE): #in /usr/share/dict/words
if re.search("^anti", line, flags = re.IGNORECASE): #in /usr/share/dict/words
"prochoice|prolife" "pro(choice|life)" #a*b+a*c = a*(b+c). What would go wrong without the parentheses? "^(anti|pro)(choice|life|abortion)" #six possibilities
"colou?r" #American or British spelling "dialog(ue)?s" #What would go wrong without the parentheses?
"^..*$" #lines consisting of one or more characters "^.+$" #lines consisting of one or more characters
"^.....$" #lines consisting of exactly 5 characters
"^.{5}$" #lines consisting of exactly 5 characters
"^.{5,7}$" #lines consisting of a minimum of 5 and a maximum of 7 characters
"^.{,7}$" #lines consisting of a minimum of 0 and a maximum of 7 characters
"^.{5,}$" #lines consisting of a minimum of 5 characters
#More complicated way to find lines consisting of a minimum of 5 and a maximum of 7 characters.
"^(.{5}|.{6}|.{7})$"
A domain name must consist of one or more dot-separated labels, each starting with a letter. If the label contains additional characters, the last character must be a letter or digit. The characters in the middle of the label could be letters, digits, or hyphens. A label must be less than 64 characters long.
#Search for a line consisting of one label.
"^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$"
#Search for a line consisting of a domain name
#of one or more dot-separated labels.
r"^[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$"
An email address consists of two parts:
firstPart@secondPart.
Let’s say that the first part can contain the characters
A-Za-z0-9!#$%&'*+/=?^_`{|}~.-
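with two restrictions: a dot may not be the first or last character of the first part, and the first part may not contain two consecutive dots.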
#Search for a line consisting of one email address.
if re.search(r"^[a-z0-9!#$%&'*+/=?^_`{|}~-]([a-z0-9!#$%&'*+/=?^_`{|}~.-]*[a-z0-9!#$%&'*+/=?^_`{|}~-])?@[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$", line, flags = re.IGNORECASE) \
    and not re.search(r"\.\..*@", line):