JS复杂去重一定要先排序吗？深度解析与性能对比

引言

在日常开发中，数组去重是JavaScript中常见的操作。对于简单数据类型，我们通常会毫不犹豫地使用Set。但当面对复杂对象数组时，很多开发者会产生疑问：复杂去重一定要先排序吗？

这个问题背后其实隐藏着几个更深层次的考量:

排序是否会影响原始数据顺序？
排序的性能开销是否值得？
是否有更优雅的解决方案？

1. 常见的排序去重方案

1.1 传统的排序去重思路

javascript 复制代码

// 先排序后去重的经典写法
function sortThenUnique(arr, key) {
  return arr
    .slice()
    .sort((a, b) => {
      // 避免修改原始数组
      const valueA = key ? a[key] : a;
      const valueB = key ? b[key] : b;
      if (valueA < valueB) return -1;
      if (valueA > valueB) return 1;
      return 0;
    })
    .filter((item, index, array) => {
      if (index === 0) return true; // 保留第一个元素
      const value = key ? item[key] : item;
      const prevValue = key ? array[index - 1][key] : array[index - 1];
      return value !== prevValue; // 仅保留与前一个元素不同的元素
    });
}

1.2 排序去重的优缺点

优点:

代码逻辑相对直观
对于已排序或需要排序的数据，可以一步完成
在某些算法题中可能是必要步骤

缺点:

时间复杂度至少为 O(n log n)
改变了原始数据的顺序
对于不需要排序的场景是额外开销

2. 不排序的去重方案

2.1 基于Map的保持顺序方案

javascript 复制代码

function uniqueByKey(arr, key) {
  const seen = new Map();
  const result = [];

  for (const item of arr) {
    const keyValue = item[key];
    if (!seen.has(keyValue)) {
      seen.set(keyValue, true);
      result.push(item);
    }
  }
  return result;
}

// 支持多个字段的复合键
function uniqueByMultipleKeys(arr, keys) {
  const seen = new Set();
  return arr.filter((item) => {
    const compositeKey = keys.map((key) => item[key]).join("|");
    if (seen.has(compositeKey)) {
      return false;
    }
    seen.add(compositeKey);
    return true;
  });
}

2.2 基于对象的缓存方案

javascript 复制代码

function uniqueByKeyWithObject(arr, key) {
  const cache = {};
  return arr.filter((item) => {
    const keyValue = item[key];
    if (cache[keyValue]) {
      return false;
    }
    cache[keyValue] = true;
    return true;
  });
}

2.3 基于自定义比较函数的方案

javascript 复制代码

function uniqueWithCustomComparator(arr, comparator) {
  return arr.filter((current, index, self) => {
    // 查找第一个相同元素的位置
    return self.findIndex((item) => comparator(item, current)) === index;
  });
}

// 使用示例
const users = [
  { id: 1, name: "Alice", age: 25 },
  { id: 2, name: "Bob", age: 30 },
  { id: 1, name: "Alice", age: 25 }, // 重复
  { id: 1, name: "Alice", age: 26 }, // ID相同但年龄不同
];

const uniqueUsers = uniqueWithCustomComparator(
  users,
  (a, b) => a.id === b.id && a.name === b.name
);

console.log(uniqueUsers);
// [ { id: 1, name: 'Alice', age: 25 }, { id: 2, name: 'Bob', age: 30 } ]

3. 性能对比分析

3.1 时间复杂度对比

方法	时间复杂度	空间复杂度	是否保持顺序
排序后去重	O(n log n)	O(1) 或 O(n)	否
Map去重	O(n)	O(n)	是
对象缓存去重	O(n)	O(n)	是
filter + findIndex	O(n²)	O(1)	是

3.2 实际性能测试

javascript 复制代码

// 性能测试代码示例
function generateTestData(count) {
  return Array.from({length: count}, (_, i) => ({
    id: Math.floor(Math.random() * count / 10), // 产生大量重复
    value: `item-${i}`,
    data: Math.random()
  }));
}

function runPerformanceTest() {
  const data = generateTestData(10000);
  
  console.time('Map去重');
  uniqueByKey(data, 'id');
  console.timeEnd('Map去重');
  
  console.time('排序去重');
  sortThenUnique(data, 'id');
  console.timeEnd('排序去重');
  
  console.time('filter+findIndex');
  uniqueWithCustomComparator(data, (a, b) => a.id === b.id);
  console.timeEnd('filter+findIndex');
}

测试结果趋势:

数据量<1000：各种方法差异不大
数据量1000-10000：Map方案明显占优
数据量>10000：排序方案开始显现劣势

4. 应用场景与选择建议

4.1 什么时候应该考虑排序？

1.需要有序输出时

javascript 复制代码

// 既要去重又要按特定字段排序
const getSortedUniqueUsers = (users) => {
  const uniqueUsers = uniqueByKey(users, 'id');
  return uniqueUsers.sort((a, b) => a.name.localeCompare(b.name));
};

2. 数据本身就需要排序时

javascript 复制代码

// 如果业务本来就需要排序，可以合并操作
const processData = (data) => {
  // 先排序便于后续处理
  data.sort((a, b) => a.timestamp - b.timestamp);
  // 去重
  return uniqueByKey(data, 'id');
};

3.处理流式数据时

javascript 复制代码

// 实时数据流，需要维持有序状态
class SortedUniqueCollection {
  constructor(key) {
    this.key = key;
    this.data = [];
    this.seen = new Set();
  }
  
  add(item) {
    const keyValue = item[this.key];
    if (!this.seen.has(keyValue)) {
      this.seen.add(keyValue);
      // 插入到正确位置维持有序
      let index = 0;
      while (index < this.data.length && 
             this.data[index][this.key] < keyValue) {
        index++;
      }
      this.data.splice(index, 0, item);
    }
  }
}

4.2 什么时候应该避免排序？

1.需要保持原始顺序时

javascript 复制代码

// 日志记录、时间线数据等
const logEntries = [
  {id: 3, time: '10:00', message: '启动'},
  {id: 1, time: '10:01', message: '初始化'},
  {id: 3, time: '10:02', message: '启动'}, // 重复
  {id: 2, time: '10:03', message: '运行'}
];

// 保持时间顺序很重要！
const uniqueLogs = uniqueByKey(logEntries, 'id');

2.性能敏感的应用

javascript 复制代码

// 实时渲染大量数据
function renderItems(items) {
  // 使用Map去重避免不必要的排序开销
  const uniqueItems = uniqueByKey(items, 'id');
  // 快速渲染
  return uniqueItems.map(renderItem);
}

3. 数据不可变要求

javascript 复制代码

// React/Vue等框架中，避免改变原数组
const DeduplicatedList = ({ items }) => {
  // 不改变原始数据
  const uniqueItems = useMemo(
    () => uniqueByKey(items, 'id'),
    [items]
  );
  return <List items={uniqueItems} />;
};

5. 高级技巧和优化

5.1 惰性去重迭代器

javascript 复制代码

function* uniqueIterator(arr, getKey) {
  const seen = new Set();
  for (const item of arr) {
    const key = getKey(item);
    if (!seen.has(key)) {
      seen.add(key);
      yield item;
    }
  }
}

// 使用示例
const data = [...]; // 大数据集
for (const item of uniqueIterator(data, x => x.id)) {
  // 逐个处理，节省内存
  processItem(item);
}

5.2 增量去重

javascript 复制代码

class IncrementalDeduplicator {
  constructor(key) {
    this.key = key;
    this.seen = new Map();
    this.count = 0;
  }
  
  add(items) {
    return items.filter(item => {
      const keyValue = item[this.key];
      if (this.seen.has(keyValue)) {
        return false;
      }
      this.seen.set(keyValue, ++this.count); // 记录添加顺序
      return true;
    });
  }
  
  getAddedOrder(keyValue) {
    return this.seen.get(keyValue);
  }
}

5.3 内存优化版本

javascript 复制代码

function memoryEfficientUnique(arr, key) {
  const seen = new Map();
  const result = [];
  
  // 使用WeakMap处理对象键
  const weakMap = new WeakMap();
  
  for (let i = 0; i < arr.length; i++) {
    const item = arr[i];
    const keyValue = item[key];
    
    // 对于对象类型的键值，使用WeakMap
    if (typeof keyValue === 'object' && keyValue !== null) {
      if (!weakMap.has(keyValue)) {
        weakMap.set(keyValue, true);
        result.push(item);
      }
    } else {
      if (!seen.has(keyValue)) {
        seen.set(keyValue, true);
        result.push(item);
      }
    }
  }
  
  return result;
}

6. 实战案例分析

6.1 电商商品去重

javascript 复制代码

// 场景：合并多个来源的商品数据
const productsFromAPI = [...];
const productsFromCache = [...];
const userUploadedProducts = [...];

// 需求：按商品SKU去重，保持最新数据
function mergeProducts(productLists) {
  const merged = [];
  const skuMap = new Map();
  
  // 按优先级处理（后处理的优先级高）
  productLists.forEach(list => {
    list.forEach(product => {
      const existing = skuMap.get(product.sku);
      if (!existing || product.updatedAt > existing.updatedAt) {
        if (existing) {
          // 移除旧的
          const index = merged.findIndex(p => p.sku === product.sku);
          merged.splice(index, 1);
        }
        merged.push(product);
        skuMap.set(product.sku, product);
      }
    });
  });
  
  return merged;
}

6.2 实时消息去重

javascript 复制代码

// 场景：聊天应用消息去重
class MessageDeduplicator {
  constructor(timeWindow = 5000) {
    this.timeWindow = timeWindow;
    this.messageIds = new Set();
    this.timestamps = new Map();
  }
  
  addMessage(message) {
    const now = Date.now();
    const { id } = message;
    
    // 清理过期记录
    this.cleanup(now);
    
    // 检查是否重复
    if (this.messageIds.has(id)) {
      return false;
    }
    
    // 添加新记录
    this.messageIds.add(id);
    this.timestamps.set(id, now);
    return true;
  }
  
  cleanup(now) {
    for (const [id, timestamp] of this.timestamps) {
      if (now - timestamp > this.timeWindow) {
        this.messageIds.delete(id);
        this.timestamps.delete(id);
      }
    }
  }
}

结论

回到最初的问题：JS复杂去重一定要先排序吗？

答案是否定的。排序只是众多去重策略中的一种，而非必需步骤。

我的建议:

默认使用Map方案: 对于大多数场景，基于Map或Set的去重方法在性能和功能上都是最佳选择。
根据需求选择:

需要保持顺序 → 使用Map
需要排序结果 → 先排序或后排序
数据量很大 → 考虑迭代器或流式处理
内存敏感 → 使用WeakMap或定期清理

考虑可读性和维护性: 有时清晰的代码比微小的性能优化更重要。
进行实际测试: 在性能关键路径上，用真实数据测试不同方案。

实践总结:

javascript 复制代码

// 通用推荐方案
function deduplicate(arr, identifier = v => v) {
  const seen = new Set();
  return arr.filter(item => {
    const key = typeof identifier === 'function' 
      ? identifier(item)
      : item[identifier];
    
    if (seen.has(key)) return false;
    seen.add(key);
    return true;
  });
}

// 需要排序时的方案
function deduplicateAndSort(arr, key, sortBy) {
  const unique = deduplicate(arr, key);
  return unique.sort((a, b) => {
    const aVal = a[sortBy];
    const bVal = b[sortBy];
    return aVal < bVal ? -1 : aVal > bVal ? 1 : 0;
  });
}

记住，没有银弹。最合适的去重方案取决于你的具体需求：数据规模、顺序要求、性能需求和代码上下文。希望这篇文章能帮助你在面对复杂去重问题时做出明智的选择！