之前也有做过计算文件MD5做相关校验,最近业务上再次遇上,同时需要大量文件同时计算,难免需要优化相关性能。这里用到分片计算,同时将计算过程放到webworker,避免长时间占用主线程造成卡顿,并且通过队列控制并发,避免占用过多计算机资源,造成浏览器卡顿。
整个文件直接计算MD5

import SparkMD5 from 'spark-md5'

/**
 * Compute the MD5 of an entire file in a single read.
 *
 * @param {File|Blob} file - the file whose bytes are hashed.
 * @returns {Promise<string>} resolves with the hex MD5 digest; rejects if the
 *   read fails or hashing throws.
 */
export const createMD5 = file => {
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader()
    fileReader.onerror = e => {
      console.error(e)
      return reject(e)
    }
    fileReader.onload = () => {
      try {
        // Bug fix: the original referenced an undefined `dataBuffer` and
        // called `.hash()` on an incremental SparkMD5.ArrayBuffer instance
        // (instances only expose append/end). Use the static one-shot helper
        // on the actual read result instead.
        resolve(SparkMD5.ArrayBuffer.hash(fileReader.result))
      } catch (e) {
        console.error(e)
        return reject(e)
      }
    }
    fileReader.readAsArrayBuffer(file)
  })
}
直接加载整个文件,将整个文件的二进制数据arrayBuffer丢给计算工具计算MD5,如果是一个小文件就还好,但是大文件、批量的文件,一下子将大量数据放到内存里会造成明显的卡顿。
分片计算MD5 spark-md5推荐:Incremental md5 performs a lot better for hashing large amounts of data, such as files. One could read files in chunks, using the FileReader & Blob’s, and append each chunk for md5 hashing while keeping memory usage low.
增量 md5在散列大量数据(比如文件)时表现得更好。可以使用 FileReader & Blob 的块来读取文件,并为 md5哈希添加每个块,同时保持较低的内存使用。
通过File.slice将文件分片,一块一块地增量处理spark.append(chunkArrayBuffer),最后通过spark.end()获取最终的文件md5,可以保持低内存使用,当然,我们平时讲的文件分片上传等也是基于File.slice。
上代码
import SparkMD5 from 'spark-md5'

// Size of each slice read from the file (1 MiB).
const maxChunkSize = 1024 * 1024
// Yield to the event loop after this many consecutive chunks so long files
// do not monopolise the main thread.
const chunksPerCycle = 100

/**
 * Compute a file's MD5 incrementally, one 1 MiB slice at a time, keeping
 * memory usage low.
 *
 * @param {File|{raw: File}} file - native File, or an upload-item wrapper
 *   exposing the native File as `.raw` (e.g. Element-UI uploads).
 * @returns {Promise<string>} resolves with the hex MD5 digest.
 */
export const createMD5 = file => {
  file = file.raw || file
  const stamp = Date.now()
  const timeLabel = 'md计算耗时' + stamp
  console.time(timeLabel)
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader()
    try {
      let currentChunk = 0
      const totalChunks = Math.ceil(file.size / maxChunkSize)
      const spark = new SparkMD5.ArrayBuffer()
      // Hoisted: the vendor-prefix lookup is invariant, no need to repeat it
      // for every chunk as the original did.
      const blobSlice =
        File.prototype.slice ||
        File.prototype.mozSlice ||
        File.prototype.webkitSlice
      const processChunk = start => {
        try {
          const end = Math.min(start + maxChunkSize, file.size)
          fileReader.readAsArrayBuffer(blobSlice.call(file, start, end))
        } catch (e) {
          // Bug fix: the original only logged here, leaving the promise
          // pending forever; reject so callers see the failure.
          console.error(e)
          reject(e)
        }
      }
      fileReader.onload = () => {
        spark.append(fileReader.result)
        currentChunk += 1
        if (currentChunk >= totalChunks) {
          const md5Hash = spark.end()
          spark.destroy()
          fileReader.abort()
          // Bug fix: the original ended a timer labelled '计算MD5' that was
          // never started; end the timer started above instead.
          console.timeEnd(timeLabel)
          return resolve(md5Hash)
        } else if (currentChunk % chunksPerCycle === 0) {
          // Periodic macrotask break keeps the UI responsive.
          setTimeout(() => {
            processChunk(currentChunk * maxChunkSize)
          }, 0)
        } else {
          processChunk(currentChunk * maxChunkSize)
        }
      }
      fileReader.onerror = e => {
        console.error(e)
        return reject(e)
      }
      processChunk(0)
    } catch (e) {
      fileReader.abort()
      reject(e)
    }
  })
}
借助WebWorker优化 md5计算属于高密度计算,如果直接放在js主线程,长时间占用造成卡顿难免对用户体验不好,这里也可以发挥出webworker的优势,本来webworker就是为了线程可以执行任务而不干扰用户界面而生。
这里也有两种方式,
第一种,将File直接传给worker,在worker里面分片读取计算,但是这样的话,因为File对象这种二进制数据直接传递给worker,浏览器会先给File做一层拷贝,这种拷贝方式多少会造成性能问题。
当然,二进制数据可以通过Transferable Objects ,也就是转移数据的方法,主线程把二进制数据直接转移给子线程,但是一旦转移,主线程就无法再使用这些二进制数据了(防止出现多个线程同时修改数据的局面)。那么我们能不能将File通过这种方式传过去呢?不行,Transferable Objects只支持以下几个类型;而且就算支持将File转过去,我们还得想一个问题:将File转过去后,主线程就不能用这个File对象了,那你后续怎么做进一步的处理,比如上传之类的。
以下是可以被转移 的不同规范的对象:
第二种,反正也是切片,我这里就直接在主线程切片读取,将读取后的chunkBuffer通过Transferable Objects传递到worker线程。
worker代码:
import SparkMD5 from 'spark-md5'

// Incremental hasher shared across messages; the main thread sends chunks
// strictly in file order, so a single module-level instance is sufficient.
const spark = new SparkMD5.ArrayBuffer()

// Message protocol:
//   { status: 'ing', dataBuffer } — append one chunk's ArrayBuffer.
//   { status: 'end' }             — finalise and post the digest back.
self.addEventListener('message', ({ data }) => {
  const { dataBuffer, status } = data
  try {
    switch (status) {
      case 'ing':
        spark.append(dataBuffer)
        break
      case 'end': {
        const md5 = spark.end()
        self.postMessage({ md5, status: 'success' })
        // Release internal buffers once the digest has been reported.
        spark.destroy()
        break
      }
      default:
        break
    }
  } catch (e) {
    // Report the failure to the main thread; it owns the worker's lifecycle.
    self.postMessage({ status: 'error', error: e })
    console.error(e)
  }
})
主线程:
// Size of each slice read from the file (1 MiB).
const maxChunkSize = 1024 * 1024
// Yield to the event loop after this many consecutive chunks.
const chunksPerCycle = 100

/**
 * Compute a file's MD5 by slicing on the main thread and hashing in a
 * dedicated Web Worker. Each chunk's ArrayBuffer is handed to the worker via
 * the transfer list (zero-copy), keeping the main thread responsive.
 *
 * @param {File|{raw: File}} file - native File, or a wrapper exposing `.raw`.
 * @returns {Promise<string>} resolves with the hex MD5 digest.
 */
export const createMD5 = file => {
  file = file.raw || file
  const stamp = Date.now()
  const timeLabel = 'md计算耗时' + stamp
  console.time(timeLabel)
  const worker = new Worker(
    new URL('../worker/md5-encode.worker.js', import.meta.url)
  )
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader()
    // Single teardown path so every failure branch releases both resources.
    const cleanup = () => {
      fileReader.abort()
      worker.terminate()
    }
    try {
      let currentChunk = 0
      const totalChunks = Math.ceil(file.size / maxChunkSize)
      const blobSlice =
        File.prototype.slice ||
        File.prototype.mozSlice ||
        File.prototype.webkitSlice
      const processChunk = start => {
        try {
          const end = Math.min(start + maxChunkSize, file.size)
          fileReader.readAsArrayBuffer(blobSlice.call(file, start, end))
        } catch (e) {
          // Bug fix: the original only logged here, leaving the promise
          // pending forever and the worker alive.
          console.error(e)
          cleanup()
          reject(e)
        }
      }
      fileReader.onload = () => {
        // Second argument is the transfer list: the buffer moves to the
        // worker instead of being structured-cloned.
        worker.postMessage(
          { dataBuffer: fileReader.result, status: 'ing' },
          [fileReader.result]
        )
        currentChunk += 1
        if (currentChunk >= totalChunks) {
          worker.postMessage({ status: 'end' })
          fileReader.abort()
          return
        } else if (currentChunk % chunksPerCycle === 0) {
          setTimeout(() => {
            processChunk(currentChunk * maxChunkSize)
          }, 0)
        } else {
          processChunk(currentChunk * maxChunkSize)
        }
      }
      fileReader.onerror = e => {
        console.error(e)
        // Bug fix: the original left the worker running after a read error.
        cleanup()
        return reject(e)
      }
      // Bug fix: without onerror, a worker script-load or runtime failure
      // never settled the promise and leaked the worker.
      worker.onerror = e => {
        console.error(e)
        cleanup()
        reject(e)
      }
      worker.onmessage = ({ data }) => {
        const { md5, status, error } = data
        if (status === 'success') {
          resolve(md5)
        } else {
          reject(error)
        }
        console.timeEnd(timeLabel)
        worker.terminate()
      }
      processChunk(0)
    } catch (e) {
      cleanup()
      reject(e)
    }
  })
}
第二种方式比第一种直接传File对象的方式快出File拷贝的时间,处理一个300多M的文件,整体会快几百ms到1s。
借助队列函数进一步优化 现在计算md5不会堵塞主线程了,但是如果大量文件同时进行分片读取,同样会造成一定的卡顿,同时,大家也要知道,worker开到一定数量也是会造成浏览器卡顿的,和电脑的资源大小有关,所以可以通过队列控制计算md5方法的并发量,避免一下子占用太多资源。
以下是我写的一个队列函数,结合一下上面的createMD5即可达到目的。
/**
 * Create a concurrency-limited wrapper around an async function.
 *
 * @param {number} concurrency - maximum number of `fn` calls in flight at once.
 * @param {Function} fn - async task; receives the value passed to the wrapper.
 * @returns {Function} enqueue(dataItem, getRemoveQueueSource) -> Promise that
 *   settles with fn's result for that item. If `getRemoveQueueSource` is
 *   provided, it is called with a canceller that removes the task from the
 *   waiting queue; the canceller has no effect once the task has started.
 */
export const createQueue = (concurrency, fn) => {
  const waiting = [] // tasks deferred because the limit was reached
  const running = [] // tasks currently in flight

  // Drop a still-waiting task (used by the caller-supplied canceller).
  const removeWaiting = task => {
    const index = waiting.findIndex(item => item === task)
    if (index !== -1) {
      waiting.splice(index, 1)
    }
  }

  // Renamed from `process` to avoid shadowing the Node/global `process`.
  const enqueue = (dataItem, getRemoveQueueSource) => {
    return new Promise((resolve, reject) => {
      const run = async () => {
        if (running.length >= concurrency) {
          // At capacity: park the task and hand the caller a canceller.
          waiting.push(run)
          getRemoveQueueSource && getRemoveQueueSource(() => removeWaiting(run))
          return
        }
        running.push(run)
        try {
          resolve(await fn(dataItem))
        } catch (e) {
          reject(e)
        } finally {
          running.splice(running.indexOf(run), 1)
          if (waiting.length) {
            // A slot just freed up: start the next deferred task.
            waiting.shift()()
          }
        }
      }
      run()
    })
  }

  return enqueue
}
使用:
const createMD5ByQueue = createQueue (10 , createMD5)
终极武器-WebAssembly 最近看掘金发现了有人把md5计算用wasm来做,用rust语言来计算md5,编译成wasm在浏览器里使用,我测了一下,平均还要再快约1s,也是,js怎么能算得比这类编译型语言快。
大概步骤是用rust写计算流程,编译成wasm,然后放我们项目中使用,当然,过程肯定没那么简单,拿来主义,github上有一个现成的库,hash-wasm—Daninet/hash-wasm: Lightning fast hash functions using hand-tuned WebAssembly binaries (github.com)
直接引入使用就好啦,当然,要注意wasm的兼容性,我这里使用了优雅降级,如果支持就用hash-wasm,不支持就用spark-md5
代码:
主线程:
// Size of each slice read from the file (1 MiB).
const maxChunkSize = 1024 * 1024
// Yield to the event loop after this many consecutive chunks.
const chunksPerCycle = 100

/**
 * Compute a file's MD5 in a Web Worker that prefers hash-wasm (WebAssembly)
 * and falls back to spark-md5. The worker is first asked to 'create' its
 * hasher; chunk reading only starts after it replies 'createSuccess'.
 *
 * @param {File|{raw: File}} file - native File, or a wrapper exposing `.raw`.
 * @returns {Promise<string>} resolves with the hex MD5 digest.
 */
export const createMD5 = file => {
  file = file.raw || file
  const stamp = Date.now()
  const timeLabel = 'md计算耗时' + stamp
  console.time(timeLabel)
  const worker = new Worker(
    new URL('../worker/md5-encode.worker.js', import.meta.url)
  )
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader()
    // Single teardown path so every failure branch releases both resources.
    const cleanup = () => {
      fileReader.abort()
      worker.terminate()
    }
    try {
      let currentChunk = 0
      const totalChunks = Math.ceil(file.size / maxChunkSize)
      const blobSlice =
        File.prototype.slice ||
        File.prototype.mozSlice ||
        File.prototype.webkitSlice
      const processChunk = start => {
        try {
          const end = Math.min(start + maxChunkSize, file.size)
          fileReader.readAsArrayBuffer(blobSlice.call(file, start, end))
        } catch (e) {
          // Bug fix: the original only logged here, leaving the promise
          // pending forever and the worker alive.
          console.error(e)
          cleanup()
          reject(e)
        }
      }
      fileReader.onload = () => {
        // Transfer list: move the buffer to the worker without copying.
        worker.postMessage(
          { dataBuffer: fileReader.result, status: 'ing' },
          [fileReader.result]
        )
        currentChunk += 1
        if (currentChunk >= totalChunks) {
          worker.postMessage({ status: 'end' })
        } else if (currentChunk % chunksPerCycle === 0) {
          setTimeout(() => {
            processChunk(currentChunk * maxChunkSize)
          }, 0)
        } else {
          processChunk(currentChunk * maxChunkSize)
        }
      }
      fileReader.onerror = e => {
        console.error(e)
        // Bug fix: the original left the worker running after a read error.
        cleanup()
        return reject(e)
      }
      worker.onerror = e => {
        console.log(e)
        // Bug fix: the original aborted the reader but never terminated the
        // worker, leaking it on worker failure.
        cleanup()
        reject(e)
      }
      worker.onmessage = ({ data }) => {
        const { md5, status, error } = data
        if (status === 'createSuccess') {
          // Hasher is ready in the worker: start streaming chunks.
          processChunk(0)
          return
        }
        if (status === 'success') {
          resolve(md5)
        } else {
          reject(error)
        }
        fileReader.abort()
        console.timeEnd(timeLabel)
        worker.terminate()
      }
      // Ask the worker to build its hasher (hash-wasm init is async).
      worker.postMessage({ status: 'create' })
    } catch (e) {
      console.error(e)
      cleanup()
      reject(e)
    }
  })
}
worker:
import SparkMD5 from 'spark-md5'
import { createMD5 } from 'hash-wasm'

/**
 * Incremental MD5 hasher that prefers hash-wasm (WebAssembly) and degrades
 * gracefully to spark-md5 when WebAssembly is unavailable.
 */
class Hasher {
  maker
  type

  /**
   * Initialise the underlying hasher.
   * @param {('hash-wasm'|'spark-md5')=} type - explicit engine choice; when
   *   omitted, WebAssembly support is auto-detected.
   */
  async create (type) {
    if (!type) {
      if (
        typeof WebAssembly === 'object' &&
        typeof WebAssembly.instantiate === 'function'
      ) {
        console.log('当前浏览器支持WebAssembly')
        this.type = 'hash-wasm'
        this.maker = await createMD5()
        this.maker.init()
      } else {
        console.log('当前浏览器不支持WebAssembly')
        this.type = 'spark-md5'
        this.maker = new SparkMD5.ArrayBuffer()
      }
      // Bug fix: the original fell through here, overwrote this.type with
      // undefined and then always built the hash-wasm maker, defeating the
      // spark-md5 fallback entirely.
      return
    }
    this.type = type
    if (type === 'spark-md5') {
      this.maker = new SparkMD5.ArrayBuffer()
    } else {
      this.maker = await createMD5()
      this.maker.init()
    }
  }

  /** Feed one chunk's ArrayBuffer into the running digest. */
  append (dataBuffer) {
    if (this.type === 'spark-md5') {
      this.maker.append(dataBuffer)
    } else {
      // hash-wasm consumes typed arrays, not raw ArrayBuffers.
      this.maker.update(new Uint8Array(dataBuffer))
    }
  }

  /** Finalise and return the hex digest. */
  end () {
    if (this.type === 'spark-md5') {
      const result = this.maker.end()
      this.maker.destroy()
      return result
    } else {
      return this.maker.digest()
    }
  }
}

// Per-worker hasher instance, built on the 'create' message.
let hasher

// Typo fixed: the original instantiated `Hashser`.
const createMd5 = async type => {
  try {
    hasher = new Hasher()
    await hasher.create(type)
    self.postMessage({ status: 'createSuccess' })
  } catch (e) {
    self.postMessage({ status: 'error', error: e })
  }
}

// Message protocol: 'create' builds the hasher, 'ing' appends a chunk,
// 'end' finalises and posts the digest back.
self.addEventListener('message', ({ data }) => {
  const { dataBuffer, status } = data
  try {
    if (status === 'create') {
      createMd5()
    } else if (status === 'ing') {
      hasher.append(dataBuffer)
    } else if (status === 'end') {
      self.postMessage({ md5: hasher.end(), status: 'success' })
    }
  } catch (e) {
    self.postMessage({ status: 'error', error: e })
    console.error(e)
  }
})
如有不妥,多多指教!