首页 / 算法 / 利用归并排序算法对大文件进行排序
利用归并排序算法对大文件进行排序
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了利用归并排序算法对大文件进行排序,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含6481字,纯文字阅读大概需要10分钟。
内容图文
![利用归并排序算法对大文件进行排序](/upload/InfoBanner/zyjiaocheng/1194/b71bb9bd54734958aafc875bef665112.jpg)
归并排序算法介绍,请参照Wikipedia
zh.wikipedia.org/wiki/%E5%BD%92%E5%B9%B6%E6%8E%92%E5%BA%8F
基本思想:
大文件分割成行数相等的两个子文件,递归(归并排序)两个子文件,直到递归到分割成的子文件低于限制行数
低于限制行数的子文件直接排序
两个排序好的子文件归并到父文件
直到最后所有排序好的父文件归并到输入的大文件并返回
之前看了网上很多示例代码,写得很不简洁, 引入了过多的临时变量i, j, k等等, 导致程序基本没法看,
只好自己写了一个,没有很关心执行效率, 只求够用,以后有机会再优化一下吧。
Performance:
输入999999行
cost: 10140 MILLISECONDS
cost: 10 MICROSECONDS per line
JDK要求
Java 8
package com.java.sort.merge; import com.google.common.base.Charsets; import com.google.common.base.Stopwatch; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import com.google.common.io.Files; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.io.filefilter.AndFileFilter; import org.apache.commons.io.filefilter.PrefixFileFilter; import org.apache.commons.io.filefilter.SuffixFileFilter; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; public class FileMergeSort { private static final Logger log = LogManager.getLogger(); private static final long total = 999999L; private static final int limit = 9999; private static void cleanTempFiles() { FilenameFilter filter = new AndFileFilter(ImmutableList.of(new PrefixFileFilter("sort"), new SuffixFileFilter(".part"))); ImmutableList.copyOf(FileUtils.getTempDirectory().listFiles(filter)).forEach(File::delete); } private static int lineNumber(File input) throws IOException { int count = 0; LineIterator iterator = FileUtils.lineIterator(input); while (iterator.hasNext()) { iterator.next(); count++; } return count; } private static File split(File input, int from, int to) throws IOException { File part = File.createTempFile("sort", ".part"); Long lineNumber = 0L; String line = null; List<String> lines = new ArrayList<>(to - from); LineIterator iterator = FileUtils.lineIterator(input); while (iterator.hasNext()) { if (lineNumber > to) break; line = iterator.next(); if (lineNumber >= from && lineNumber <= 
to) { lines.add(line); } lineNumber++; } FileUtils.writeLines(part, lines); return part; } private static File merge(File source, File left, File right) throws IOException { PeekingIterator<String> leftLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(left)); PeekingIterator<String> rightLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(right)); String leftLine, rightLine; try (Writer writer = Files.newWriter(source, Charsets.UTF_8)) { writer.write(""); while (leftLineIterator.hasNext() && rightLineIterator.hasNext()) { leftLine = leftLineIterator.peek(); rightLine = rightLineIterator.peek(); if (leftLine.compareTo(rightLine) < 0) { writer.append(leftLine.concat(IOUtils.LINE_SEPARATOR)); leftLineIterator.next(); } else { writer.append(rightLine.concat(IOUtils.LINE_SEPARATOR)); rightLineIterator.next(); } } while (leftLineIterator.hasNext()) { writer.append(leftLineIterator.next().concat(IOUtils.LINE_SEPARATOR)); } while (rightLineIterator.hasNext()) { writer.append(rightLineIterator.next().concat(IOUtils.LINE_SEPARATOR)); } } return source; } private static File directSort(File input) throws IOException { List<String> list = new ArrayList<>(limit); FileUtils.lineIterator(input).forEachRemaining(list::add); list.sort(String::compareTo); FileUtils.writeLines(input, list); return input; } public static File mergeSort(File input) throws IOException { int total = lineNumber(input); if (total <= limit) { return directSort(input); } int half = total / 2; File left = mergeSort(split(input, 0, half)); File right = mergeSort(split(input, half + 1, total)); return merge(input, left, right); } @BeforeClass public static void init() throws IOException { cleanTempFiles(); int minLength = String.valueOf(total).length(); try (Writer writer = Files.newWriter(new File("long.txt"), Charsets.UTF_8)) { writer.write(""); for (long i = total; i > 0L; i--) { writer.append(Strings.padStart(String.valueOf(i), minLength, ‘0‘).concat(IOUtils.LINE_SEPARATOR)); } } } 
@AfterClass public static void clean() { cleanTempFiles(); } @Test public void testSort() throws IOException { Stopwatch watch = Stopwatch.createStarted(); File sorted = mergeSort(new File("long.txt")); watch.stop(); log.info(String.format("cost: %s MILLISECONDS", watch.elapsed(TimeUnit.MILLISECONDS))); log.info(String.format("cost: %s MICROSECONDS per line", watch.elapsed(TimeUnit.MICROSECONDS) / total)); } }
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the FileMergeSort sample: Guava + commons-io utilities,
     log4j2 for logging, JUnit 4 for the self-test. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.java.app</groupId>
    <artifactId>sample</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- FileUtils / LineIterator / IOUtils / file filters -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
        <!-- Stopwatch, Strings.padStart, PeekingIterator, Files.newWriter -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>18.0</version>
        </dependency>
        <!-- log4j2 API + core implementation + commons-logging bridge -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-jcl</artifactId>
            <version>2.1</version>
        </dependency>
        <!-- Test-only: the sort class is also a JUnit 4 test case -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
原文:http://ivarchen.iteye.com/blog/2179500
内容总结
以上是互联网集市为您收集整理的利用归并排序算法对大文件进行排序全部内容,希望文章能够帮你解决利用归并排序算法对大文件进行排序所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。