第九章 字符串处理

9.1 Rust中的字符串类型

Rust中有两种主要的字符串类型：

String：可变的、有所有权的UTF-8编码字符串
str：不可变的字符串slice，通常以&str形式使用

9.2 String类型

创建String

// 空字符串
let mut s = String::new();
 
// 从字符串字面量创建
let s = String::from("hello");
let s = "hello".to_string();
 
// 从其他类型创建
let s = format!("{}-{}-{:?}", "hello", 42, true);

修改String

追加字符串：

let mut s = String::from("foo");
s.push_str("bar");  // foobar
 
let mut s = String::from("lo");
s.push('l');  // lol，push追加单个字符

连接字符串：

let s1 = String::from("Hello, ");
let s2 = String::from("world!");
let s3 = s1 + &s2;  // s1被移动，s2被借用
 
// 使用format!（不会获取所有权）
let s1 = String::from("tic");
let s2 = String::from("tac");
let s3 = String::from("toe");
let s = format!("{}-{}-{}", s1, s2, s3);  // s1, s2, s3都仍然有效

插入和删除：

let mut s = String::from("Hello");
 
// 插入字符
s.insert(5, ',');  // "Hello,"
 
// 插入字符串
s.insert_str(6, " world");  // "Hello, world"
 
// 删除字符（参数是字节索引，必须位于字符边界上，否则会panic）
s.remove(0);  // 删除第一个字符
 
// 清除
s.clear();  // 清空字符串

9.3 字符串索引

Rust不支持使用整数索引访问字符串中的单个字符：

let s = String::from("hello");
// let h = s[0];  // 编译错误！String不能用整数索引

原因：UTF-8是变长编码，一个字符可能占1到4个字节，字节位置与字符位置并不一一对应

let hello = "中国人";
 
// 每个汉字占3字节
println!("字节数：{}", hello.len());  // 9
 
// 获取字节
for b in hello.bytes() {
    println!("{}", b);
}

使用chars()遍历Unicode标量值：

for c in "中国人".chars() {
    println!("{}", c);
}
// 输出：中、国、人

使用char_indices()：

for (i, c) in "中国人".char_indices() {
    println!("索引{}：字符'{}'", i, c);
}
// 索引0：字符'中'
// 索引3：字符'国'
// 索引6：字符'人'

9.4 字符串Slice

创建Slice：

let s = String::from("hello world");
let hello = &s[0..5];
let world = &s[6..11];

注意字节边界：

let s = "中国人";
let zhong = &s[0..3];  // "中" - 正确
// let guo = &s[0..2];    // 运行时panic！字节索引2不在UTF-8字符边界上

9.5 字符串遍历

按字符遍历

for c in "hello 中国人".chars() {
    println!("{}", c);
}

按字节遍历

for b in "hello".bytes() {
    println!("{}", b);
}

按行遍历

let text = "line1\nline2\nline3";
for line in text.lines() {
    println!("{}", line);
}

按单词遍历

let text = "hello world from Rust";
for word in text.split_whitespace() {
    println!("{}", word);
}

9.6 字符串方法

查找和替换

let s = String::from("hello world");
 
// 查找子串位置
let pos = s.find("world");  // Some(6)
let pos = s.find("xyz");    // None
 
// 替换
let new_s = s.replace("world", "Rust");  // "hello Rust"
 
// 替换指定次数
let new_s = s.replacen("l", "L", 1);  // "heLlo world"

分割字符串

let s = "a,b,c,d";
 
// 按字符分割
let parts: Vec<&str> = s.split(',').collect();
 
// 按空白字符分割（空格、制表符、换行等）
let s = "a b\tc\nd";
let parts: Vec<&str> = s.split_whitespace().collect();
 
// 按字符串分割
let s = "hello::world::Rust";
let parts: Vec<&str> = s.split("::").collect();
 
// 限制分割次数
let s = "a,b,c,d";
let parts: Vec<&str> = s.splitn(2, ',').collect();  // ["a", "b,c,d"]

修剪空白

let s = "  hello world  ";
 
// 去除两端
let trimmed = s.trim();      // "hello world"
 
// 去除开头
let trimmed = s.trim_start();
 
// 去除结尾
let trimmed = s.trim_end();
 
// 去除指定字符
let s = "xxxhello worldxxx";
let trimmed = s.trim_matches('x');  // "hello world"

大小写转换

let s = "Hello";
 
println!("{}", s.to_uppercase());  // "HELLO"
println!("{}", s.to_lowercase());  // "hello"

检查

let s = "hello world";
 
// 是否以...开头
assert!(s.starts_with("hello"));
 
// 是否以...结尾
assert!(s.ends_with("world"));
 
// 是否包含
assert!(s.contains("lo wo"));
 
// 统计模式出现的次数
assert!(s.matches("l").count() == 3);

9.7 String与&str的转换

String转&str

let s = String::from("hello");
 
// &操作符
let slice: &str = &s;
 
// as_str()
let slice = s.as_str();
 
// Deref强制转换（deref coercion）：&String会自动转换为&str
// Accepts any borrowed string slice and does nothing with it; it exists only
// so that the call below can show a `&String` being passed where `&str` is
// expected.
fn takes_str(s: &str) {
    let _ = s;
}
takes_str(&s);  // 自动转换

&str转String

let slice = "hello";
 
// to_string()
let s = slice.to_string();
 
// String::from()
let s = String::from(slice);
 
// into()
let s: String = slice.into();

9.8 UTF-8处理

获取字符数

let s = "中国人";
 
// 字节数
println!("字节数：{}", s.len());  // 9
 
// 字符数（Unicode标量值）
println!("字符数：{}", s.chars().count());  // 3

字符边界检查

/// Returns the sub-slice `s[start..end]`, or `None` when the range is unusable.
///
/// `start` and `end` are byte offsets into `s`. The result is `None` when
/// either offset is not on a UTF-8 character boundary, when either offset is
/// past the end of `s`, or when `start > end`. This function never panics.
fn safe_slice(s: &str, start: usize, end: usize) -> Option<&str> {
    // `str::get` checks bounds, ordering and character boundaries in one
    // step. Checking the two boundaries by hand and then indexing with
    // `&s[start..end]` still panics for a reversed range such as `2..1`.
    s.get(start..end)
}
 
let s = "中国人";
println!("{:?}", safe_slice(s, 0, 3));   // Some("中")
println!("{:?}", safe_slice(s, 0, 2));   // None

9.9 字符串与所有权

转换时消耗所有权

let s = String::from("hello");
 
// into_bytes()消耗所有权
let bytes = s.into_bytes();
// s在这里已失效

避免不必要的克隆

// 不好的做法
let s1 = String::from("hello");
let s2 = s1.clone();  // 深拷贝
let slice = &s2[0..2];
 
// 好的做法
let s1 = String::from("hello");
let slice = &s1[0..2];  // 借用

练习题

练习题9.1：反转字符串

/// Builds a new `String` holding the Unicode scalar values of `s` in reverse
/// order.
fn reverse(s: &str) -> String {
    // Reordering whole `char`s keeps the total UTF-8 byte length unchanged,
    // so the input length is the exact capacity needed.
    let mut reversed = String::with_capacity(s.len());
    for ch in s.chars().rev() {
        reversed.push(ch);
    }
    reversed
}
 
// Prints a mixed ASCII/Chinese sample string followed by its reversal.
fn main() {
    let original = "hello 中国";
    let reversed = reverse(original);

    println!("原字符串：{}", original);
    println!("反转后：{}", reversed);
}

练习题9.2：判断回文

/// Reports whether `s` reads the same forwards and backwards.
///
/// Only alphanumeric characters take part in the comparison, and letter case
/// is ignored. The comparison is done per Unicode scalar value. An input with
/// no alphanumeric characters (including the empty string) is a palindrome.
fn is_palindrome(s: &str) -> bool {
    let normalized: Vec<char> = s
        .chars()
        .filter(|c| c.is_alphanumeric())
        // A single `char` can lowercase to several `char`s (for example
        // 'İ' becomes "i\u{307}"), so keep the whole mapping instead of
        // only its first element.
        .flat_map(char::to_lowercase)
        .collect();

    // Compare the sequence with its own reverse without building a second
    // collection.
    normalized.iter().eq(normalized.iter().rev())
}
 
// Demonstrates `is_palindrome` on ASCII text with punctuation, on a
// non-palindrome, and on a Chinese palindrome.
fn main() {
    println!("{}", is_palindrome("A man, a plan, a canal: Panama"));  // true
    println!("{}", is_palindrome("race a car"));  // false
    // "上海自来水来自海上" reads the same in both directions. A string such as
    // "中国人中国" does not: reversed it is "国中人国中".
    println!("{}", is_palindrome("上海自来水来自海上"));  // true
}

练习题9.3：字符串统计

/// Prints statistics about `s` to standard output: the string itself, its
/// byte, character, word and line counts, and a breakdown of its characters
/// into letters, digits, whitespace and everything else.
fn analyze(s: &str) {
    println!("字符串：{}", s);
    println!("字节数：{}", s.len());
    println!("字符数：{}", s.chars().count());
    println!("单词数：{}", s.split_whitespace().count());
    println!("行数：{}", s.lines().count());

    // Counters, in order: letters, digits, whitespace, others. Each character
    // lands in the first category whose test it passes.
    let mut tally = [0; 4];
    for ch in s.chars() {
        let bucket = match ch {
            c if c.is_alphabetic() => 0,
            c if c.is_numeric() => 1,
            c if c.is_whitespace() => 2,
            _ => 3,
        };
        tally[bucket] += 1;
    }
    let [letters, digits, spaces, others] = tally;

    println!(
        "字母：{}，数字：{}，空白：{}，其他：{}",
        letters, digits, spaces, others
    );
}
 
// Runs `analyze` on a two-line sample that mixes ASCII, digits, punctuation
// and Chinese text.
fn main() {
    analyze("Hello, Rust 2024!\n这是第2行。");
}

练习题9.4：CSV解析简化版

/// Splits `line` on every comma and returns the fields with surrounding
/// whitespace trimmed.
///
/// The returned slices borrow from `line`. There is no quoting or escaping
/// support: every comma is a separator.
fn parse_csv_line(line: &str) -> Vec<&str> {
    let mut fields = Vec::new();
    for raw in line.split(',') {
        fields.push(raw.trim());
    }
    fields
}
 
fn main() {
    let csv = "name, age, city\nAlice, 30, New York\nBob, 25, London";
 
    for line in csv.lines() {
        let fields = parse_csv_line(line);
        println!("{:?}", fields);
    }
}

本章小结

本章学习了Rust的字符串处理：

String：可变的、有所有权的字符串
&str：字符串slice，不可变借用
索引限制：Rust不支持用整数索引访问字符串，因为UTF-8是变长编码
遍历方式：chars()、bytes()、lines()等
常用方法：find、replace、split、trim等
UTF-8处理：注意字符边界，使用is_char_boundary检查

理解String和&str的区别，以及UTF-8编码特性，是Rust字符串处理的关键。

目录

第九章 字符串处理