Strings are immutable sequences of characters
import re

s = 'Hello World!'

# Access elements by index, by slice syntax, and with an explicit slice object.
print(s[0])   # H
print(s[:2])  # He
sl = slice(0, 10, 2)
print(s[sl])  # HloWr

# Concatenation builds a new string; s itself is never modified.
print(s + ' ...')

# Membership test on a substring.
if 'H' in s:
    print('Contain H')

# Raw string: the r prefix suppresses the meaning of escape characters.
print(r'raw string\n')
s = 'raw string\n'
print('%r' % s)  # 'raw string\n' -- %r formats with repr(), so the escape is shown

s = 'Hello World!'

# capitalize: upper-cases the first letter and lower-cases the rest.
print(s.capitalize())  # Hello world!

# center: pad on both sides with the fill character up to the given width.
# NOTE(review): the receiver and width were missing from this line in the
# original notes (`print(, '*'))`); s.center(20, '*') is a reconstruction.
print(s.center(20, '*'))  # ****Hello World!****

if s.endswith('!'):
    print('end with ! ...')

# find: lowest index of the substring, or -1 when it is not found.
print(s.find('or'))  # 7

# index: like find, but raises ValueError when the substring is not found.
print(s.index('or'))  # 7

c = '-'
print(c.join(['a', 'b', 'c']))  # a-b-c
print(s.replace('l', '-', 2))   # He--o World! (only the first 2 matches)

# Named `text` rather than `str` so the builtin str type is not shadowed.
text = "Line1-abcdef, \nLine2-abc, \nLine4-abcd"
# Split on multiple delimiters. ', ' is immediately followed by '\n' here,
# so an empty string appears between each pair of lines:
# ['Line1-abcdef', '', 'Line2-abc', '', 'Line4-abcd']
print(re.split('\n|, ', text))

print(' Hello ... '.strip())  # Hello ...
print(s.upper())              # HELLO WORLD!
Regular Expression
- ^, beginning of a line
- $, end of a line
- ., matches any single character except newline
- [...], matches any single character in brackets
- [^...], matches any single character not in brackets
- *, matches 0 or more occurrences of preceding expression
- +, matches 1 or more occurrences of preceding expression
- ?, matches 0 or 1 occurrence of preceding expression
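- re{n}, matches exactly n occurrences of preceding expression
- re{n,m}, matches at least n and at most m occurrences of preceding expression (write no space after the comma, otherwise the braces are matched literally)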
- a|b, matches either a or b
- \w, matches word characters (letters, digits and underscore)
- \W, matches non-word characters
- <.*>, greedy repetition, matches the largest number of repetitions
- <.*?>, non-greedy repetition, matches the smallest number of repetitions
- re.I, performs case-insensitive matching
- re.M, makes $ match the end of a line (not just the end of the string) and makes ^ match the start of any line
import re

phone = "2004-959-559 # This is Phone Number"

# re.match only matches at the beginning of the string. The two parenthesised
# groups capture the first two runs of digits; the trailing .* consumes the rest.
m = re.match(r'(\d+)-(\d+)-\d+.*', phone)
if m:
    print(m.group())   # 2004-959-559 # This is Phone Number (whole match)
    print(m.group(1))  # 2004
    print(m.groups())  # ('2004', '959')

# re.search scans the string and returns the first match found anywhere in it.
s = re.search(r'\d+', phone)
if s:
    print(s.group())  # 2004

# re.findall returns every non-overlapping match as a list of strings.
a = re.findall(r'\d+', phone)
print(a)  # ['2004', '959', '559']

# re.sub replaces every match; here each single digit becomes '*'.
r = re.sub(r'\d', '*', phone)
print(r)  # ****-***-*** # This is Phone Number
A str is an immutable sequence of Unicode code points
Python keeps characters as Unicode in memory
In Python 3, type 'str' represents Unicode text and type 'bytes' represents a byte string
# Text (str) versus its encoded form (bytes).
# The u prefix is still accepted in Python 3 and produces the same str type,
# so both literals report the same type and length.
for s in ('Café', u'Café'):
    print(type(s), len(s))  # <class 'str'> 4

# encode() turns the text into bytes using the named encoding.
encoded = s.encode('utf-8')
print(type(encoded))  # <class 'bytes'>
# Read the first line of a file as bytes, decode it to text and print it.
# The context manager closes the file even if reading fails; readline()
# returns b'' for an empty file instead of raising StopIteration.
with open('temp.txt', 'rb') as f:
    l = f.readline()  # binary mode: the line comes back as bytes
print(type(l))  # <class 'bytes'>
l = l.decode('utf-8')  # bytes -> str
print(type(l))  # <class 'str'>
print(l)  # e.g. 陈 if that is the file's first line; print encodes the text for the console
# ord() maps a single character to its Unicode code point; chr() is the inverse.
c = ord('陈')  # 38472
# chr() returns the character itself, so the character is printed, not the number.
print(chr(38472))  # 陈
read unicode, output unicode
# Read the first line of a text file and write it to another text file.
# Text mode decodes on read and encodes on write. The encoding is given
# explicitly so the result does not depend on the platform default, and the
# context managers close both files.
with open('temp.txt', 'r', encoding='utf-8') as f:
    l = f.readline()  # e.g. 陈; text mode returns str, not bytes
print(type(l))  # <class 'str'>
with open('output.txt', 'w', encoding='utf-8') as o:
    o.write(l)  # already str; the file object encodes it to UTF-8
read byte string, output unicode
# Read the first line of a file as bytes and write it out through text I/O.
# Context managers close both files; the output encoding is given explicitly
# so it does not depend on the platform default.
with open('temp.txt', 'rb') as f:
    l = f.readline()  # e.g. the UTF-8 bytes of 陈; binary mode returns bytes
print(type(l))  # <class 'bytes'>
with open('output.txt', 'w', encoding='utf-8') as o:  # write with text I/O
    o.write(l.decode('utf-8'))  # text files accept str only, so decode first
read byte string, output byte string
# read string from a file and print it to screen
f = open('temp.txt', 'rb');
l = next(f); # 陈, read a line and save it to byte string
print(type(l)) # byte string
o = open('output.txt', 'wb') # write with Byte IO