Python技巧:字符串处理

拆分含有多种分隔符的字符串

  1. 使用.split()
    注意list.extend()
    append是把整个列表作为一个元素追加到原列表末尾,而extend则是把参数序列中的每个元素逐个追加到原列表中

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    s = 'liio;15176|0.0\\0.0,522,65,228,72:p|ts/11+R+'
    def mySplit(s, ds):
        """Split ``s`` on every delimiter contained in ``ds``.

        The string is split by each delimiter in turn; every piece produced
        by one pass is split again by the next delimiter.

        Args:
            s: The string to split.
            ds: An iterable of delimiter strings. Passing a plain string uses
                each of its characters as a separate delimiter.

        Returns:
            A list of the non-empty pieces, in their original order.
        """
        res = [s]
        for d in ds:
            pieces = []
            # Use an explicit loop rather than map(): map() called only for
            # its side effect is lazy on Python 3 and would leave the list
            # empty.
            for x in res:
                # extend() flattens the split result into the list, whereas
                # append() would add the whole sub-list as a single element.
                pieces.extend(x.split(d))
            res = pieces
        # Adjacent or trailing delimiters produce empty strings; drop them.
        return [x for x in res if x]
    new = mySplit(s,';|\\.+,,/')
    print 'new[] = ',new
  2. 使用正则表达式re.split()

    1
    2
    3
    4
    5
    6
    7
    8
    9
    # Approach 2: split with a regular expression
    import re
    s = 'liio;15176|0.0\\0.0,522,65,228,72:p|ts/11+R+'
    # Signature: def split(pattern, string, maxsplit=0, flags=0)
    # Splits the source string by the occurrences of the pattern and
    # returns a list containing the resulting substrings.
    # Every character inside [] acts as a delimiter; the trailing + treats a
    # run of consecutive delimiters as one separator.
    # NOTE: s ends with a delimiter, so the result ends with an empty string.
    q = re.split(r'[;|\\.+,,/]+',s)
    print q

两种方法对比: str.split()速度更快,但一次只能处理一种分隔符; 需要同时处理多种分隔符时,推荐使用正则表达式re.split()

判断字符串a是否以字符串b开头或结尾

不仅可以用来处理文本文件,还可以处理磁盘中的文件,筛选出所需的文件类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os,stat
f = os.listdir('ceshi/')
print f
s = f[0]
'''
def endswith(self, suffix, start=None, end=None)
Return True if S ends with the specified suffix, False otherwise.
With optional start, test S beginning at that position.
With optional end, stop comparing S at that position.
suffix can also be a tuple of strings to try. 要求是元组!!不能用list
'''
# print s.endswith(('.sh','.py'))
for name in os.listdir('ceshi/'):
if name.endswith(('.sh','.py')):
print name

#修改文件权限前要先查看文件权限,用stat
print os.stat('ceshi/a.sh')
print os.stat('ceshi/a.sh').st_mode
#先把权限代码转换成八进制
#666分别是user,group,others的权限值
print oct(os.stat('ceshi/a.sh').st_mode)
os.chmod('ceshi/a.sh',os.stat('ceshi/a.sh').st_mode | stat.S_IXUSR)
#看来666是最大权限了
print oct(os.stat('ceshi/a.sh').st_mode)

调整字符串中文本的格式

配合正则表达式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import re
f = open('ceshi/date.txt','r')
#def sub(pattern, repl, string, count=0, flags=0)
"""Return the string obtained by replacing the leftmost
non-overlapping occurrences of the pattern in string by the
replacement repl. repl can be either a string or a callable;
if a string, backslash escapes in it are processed. If it is
a callable, it's passed the match object and must return
a replacement string to be used."""
#首先描述日期格式
#repl中的/是分隔符
#re.sub第一个参数中的每个括号都是正则表达式的一个捕获组,根据左括号的顺序排序
#pattern:匹配规则 repl:要替换成什么样子 string:要替换谁
#rel参数调整了捕获组在原文本中的顺序,/是分隔符
print re.sub('(\d{4})-(\d{2})-(\d{2})',r'\2/\3/\1',f.read())
f.close()
#使用r来输出原始字符串,消除\的转义功能
#字符串什么样子就打印出什么样子,所以叫做原始字符串
print r'\\1
print r'\1'

f = open('ceshi/date.txt','r')
#re.sub()中还可以给组起个名字,用?P<>起名
#月日年是美国的日期格式
print re.sub('(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',r'\g<month>/\g<day>/\g<year>',f.read())
f.close()

将多个小字符串拼接成一个大的字符串

  1. 直接用+拼接字符串
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    s1 = 'abcdefg'
    s2 = '12345'
    #s1 + s2实际调用的是s1.__add__(s2)这个方法
    #def __add__(self, y)
    #x.__add__(y) <==> x+y
    print s1 + s2
    print str.__add__(s1,s2)
    #这样也可以
    print str.__add__(s1,s2)

    #str.__gt__()
    print s1>s2
1
2
3
4
5
6
7
8
9
from random import randint
l = [str(randint(0,50)) for x in range(10)]
print l
s = ''
#这种方法会不断的创建临时变量,再释放,如果字符串特别长,开销就会很巨大,用str.join()方法
for x in l:
s+=x
print s
print s
  2. 用str.join()
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    def join(self, iterable)
    """
    S.join(iterable) -> string
    Return a string which is the concatenation of the strings in the iterable.
    注意!!支持的是可迭代对象里面的字符串类型!!!!
    The separator between elements is S.
    S是作为分隔符
    """
    #这种方法一次性就连接起来,不存在临时变量的浪费
    print ';'.join(['abc','123','aaaaa'])
    print ''.join(['abc','123','aaaaa'])
    #切记:如果列表里面既有str又有int,flaoat,就不能直接连接!!!!!!!!

    #可以用列表生成式.不过这样做会先形成一个完整的列表,可以用生成器
    l = ['abc','123','aaaaa',11,2]
    print ''.join([str(x) for x in l])
    #生成器方法,将[]改成()即可,非常方便
    #<type 'generator'>
    print type(((str(x) for x in l )))
    print ''.join((str(x) for x in l ))

将字符串进行左,右,居中对齐

  1. 使用str.ljust() str.center() str.rjust()

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    s = 'abc'
    #def ljust(self, width, fillchar=None)
    """
    左对齐
    S.ljust(width[, fillchar]) -> string
    Return S left-justified in a string of length width.
    Padding is done using the specified fill character (default is a space).
    """
    s.ljust(10,'-') #左对齐,一共10个字符的位置,用空格补充空位置
    print s.ljust(10,'-')
    #右对齐
    print s.rjust(10)
    print len(s.rjust(10))
    #居中对齐 def center(self, width, fillchar=None)
    print s.center(10,'_')
  2. 使用 format()函数
    先用map函数和dict.keys()求出最长键的长度,再用ljust()按该宽度对齐输出

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    s = 'abc'
    #def format(value, format_spec=None)
    """
    format(value[, format_spec]) -> string
    Returns value.__format__(format_spec)
    format_spec defaults to ""
    """
    #左对齐
    print format(s,'<15')
    #右对齐
    print format(s,'>15')
    #居中对齐
    print format(s,'^15')

    d = {'Dist':500.0,'SmallCull':0.04,'farclip':447,'lopadDist':100.0,'trilinear':450}
    #先找到键最长的key,借助map函数!!!!!
    #对齐后
    width = max(map(len,d.keys()))
    for k in d:
    print k.ljust(width),':',d[k]
    #未对齐
    for k in d:
    print k, ':', d[k]

去掉字符串中不需要的字符

  1. 使用.strip() .lstrip() .rstrip()

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    # Demonstrates s.strip(), s.lstrip() and s.rstrip()
    s = ' ABC 123 '
    # Signature: def strip(self, chars=None)
    """
    S.strip([chars]) -> string or unicode
    leading:开头的 trailing:后面的
    Return a copy of the string S with leading and trailing whitespace removed.
    If chars is given and not None, remove characters in chars instead.
    If chars is unicode, S will be converted to unicode before stripping!!!注意编码格式!!!
    """
    # Remove whitespace from both ends
    print s.strip()
    # Remove whitespace from the left end
    print s.lstrip()
    # Remove whitespace from the right end
    print s.rstrip()
    s = '---abc+++'
    # Remove any leading/trailing '-' and '+' characters
    print s.strip('-+')
  2. 使用切片+字符串拼接

    1
    2
    3
    # Requires knowing the string's layout in advance; not suitable for large amounts of irregular text
    s = 'abc:123'
    # Drop the character at index 3 (the ':') by joining the slices around it
    print s[:3]+s[4:]
  3. 使用str.replace()或正则表达式re.sub()替换

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    # Tools for this approach: s.replace() and re.sub
    s = '\tabc\t123\txyz'
    # replace() can only replace one kind of substring per call
    # Signature: def replace(self, old, new, count=None)
    """
    S.replace(old, new[, count]) -> string
    Return a copy of string S with all occurrences of substring old replaced by new.
    If the optional argument count is given, only the first count occurrences are replaced.
    """
    print s.replace('\t','')

    使用正则表达式替换
    import re
    s = '\tabc\t123\txyz\ropq'
    # Signature: def sub(pattern, repl, string, count=0, flags=0)
    # pattern: what to match; repl: what to replace it with; string: the text to process
    # The character class [\t\r] removes both tabs and carriage returns in a single call
    print re.sub(r'[\t\r]','',s)
  4. 使用translate()方法
    string和unicode都有translate方法,二者目的相同但用法稍有不同

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    ####### string #######
    import string
    # print string.maketrans('abcxyz','xyzabc')
    s = 'abc1230022xyz'
    print s
    # Signature: def translate(self, table, deletechars=None)
    """
    S.translate(table [,deletechars]) -> string
    Return a copy of the string S, where all characters occurring in the optional argument deletechars are removed,
    and the remaining characters have been mapped through the given
    translation table, which must be a string of length 256 or None.
    If the table argument is None, no translation is applied and
    the operation simply removes the characters in deletechars.
    这是translate的原始目的
    """
    # The remaining characters are mapped through the given
    # translation table.
    # string.maketrans() builds that mapping table.
    print s.translate(string.maketrans('abcxyz','xyzabc'))

    # If the table argument is None, no translation is applied
    # and the operation simply removes the characters in deletechars
    s = 'abc\reesa\nrrt23\t'
    print s.translate(None,'\r\n\t')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
####### unicode #######
# Signature: def translate(self, table)
"""
S.translate(table) -> unicode
Return a copy of the string S, where all characters have been mapped
through the given translation table, which must be a mapping of
Unicode ordinals to Unicode ordinals, Unicode strings or None.
Unmapped characters are left untouched.
Characters mapped to None are deleted.
"""
# Characters mapped to None are deleted.
u = u'hahaǔǜhaha'
print u.translate({0x01d4:None,0x01dc:None}) # 0x01d4 is ǔ, 0x01dc is ǜ

# def fromkeys(S, v=None): restored from __doc__
"""
dict.fromkeys(S[,v]) -> New dict with keys from S and values equal to v.
v defaults to None.
"""
# v defaults to None, so fromkeys() yields a table that maps every key to None.
print dict.fromkeys([1,2,3,4])

u = u'12345'
print u.translate(dict.fromkeys([0x0031,0x0032,0x0033,0x0034])) # 0x0031 is '1'; the ordinals for 1-4 map to None and are deleted
0%