Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -34
- .gitignore +400 -0
- CODE_OF_CONDUCT.md +9 -0
- LICENSE +21 -0
- README.md +152 -3
- SECURITY.md +41 -0
- SUPPORT.md +25 -0
- config.json +70 -0
- configuration_deepseek.py +210 -0
- git_add_safetensors.sh +174 -0
- model-00001-of-000163.safetensors +3 -0
- model-00002-of-000163.safetensors +3 -0
- model-00003-of-000163.safetensors +3 -0
- model-00004-of-000163.safetensors +3 -0
- model-00005-of-000163.safetensors +3 -0
- model-00006-of-000163.safetensors +3 -0
- model-00007-of-000163.safetensors +3 -0
- model-00008-of-000163.safetensors +3 -0
- model-00009-of-000163.safetensors +3 -0
- model-00010-of-000163.safetensors +3 -0
- model-00011-of-000163.safetensors +3 -0
- model-00012-of-000163.safetensors +3 -0
- model-00013-of-000163.safetensors +3 -0
- model-00014-of-000163.safetensors +3 -0
- model-00015-of-000163.safetensors +3 -0
- model-00016-of-000163.safetensors +3 -0
- model-00017-of-000163.safetensors +3 -0
- model-00018-of-000163.safetensors +3 -0
- model-00019-of-000163.safetensors +3 -0
- model-00020-of-000163.safetensors +3 -0
- model-00021-of-000163.safetensors +3 -0
- model-00022-of-000163.safetensors +3 -0
- model-00023-of-000163.safetensors +3 -0
- model-00024-of-000163.safetensors +3 -0
- model-00025-of-000163.safetensors +3 -0
- model-00026-of-000163.safetensors +3 -0
- model-00027-of-000163.safetensors +3 -0
- model-00028-of-000163.safetensors +3 -0
- model-00029-of-000163.safetensors +3 -0
- model-00030-of-000163.safetensors +3 -0
- model-00031-of-000163.safetensors +3 -0
- model-00032-of-000163.safetensors +3 -0
- model-00033-of-000163.safetensors +3 -0
- model-00034-of-000163.safetensors +3 -0
- model-00035-of-000163.safetensors +3 -0
- model-00036-of-000163.safetensors +3 -0
- model-00037-of-000163.safetensors +3 -0
- model-00038-of-000163.safetensors +3 -0
- model-00039-of-000163.safetensors +3 -0
- model-00040-of-000163.safetensors +3 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Ignore Visual Studio temporary files, build results, and
|
2 |
+
## files generated by popular Visual Studio add-ons.
|
3 |
+
##
|
4 |
+
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
|
5 |
+
|
6 |
+
# User-specific files
|
7 |
+
*.rsuser
|
8 |
+
*.suo
|
9 |
+
*.user
|
10 |
+
*.userosscache
|
11 |
+
*.sln.docstates
|
12 |
+
|
13 |
+
# User-specific files (MonoDevelop/Xamarin Studio)
|
14 |
+
*.userprefs
|
15 |
+
|
16 |
+
# Mono auto generated files
|
17 |
+
mono_crash.*
|
18 |
+
|
19 |
+
# Build results
|
20 |
+
[Dd]ebug/
|
21 |
+
[Dd]ebugPublic/
|
22 |
+
[Rr]elease/
|
23 |
+
[Rr]eleases/
|
24 |
+
x64/
|
25 |
+
x86/
|
26 |
+
[Ww][Ii][Nn]32/
|
27 |
+
[Aa][Rr][Mm]/
|
28 |
+
[Aa][Rr][Mm]64/
|
29 |
+
bld/
|
30 |
+
[Bb]in/
|
31 |
+
[Oo]bj/
|
32 |
+
[Ll]og/
|
33 |
+
[Ll]ogs/
|
34 |
+
|
35 |
+
# Visual Studio 2015/2017 cache/options directory
|
36 |
+
.vs/
|
37 |
+
# Uncomment if you have tasks that create the project's static files in wwwroot
|
38 |
+
#wwwroot/
|
39 |
+
|
40 |
+
# Visual Studio 2017 auto generated files
|
41 |
+
Generated\ Files/
|
42 |
+
|
43 |
+
# MSTest test Results
|
44 |
+
[Tt]est[Rr]esult*/
|
45 |
+
[Bb]uild[Ll]og.*
|
46 |
+
|
47 |
+
# NUnit
|
48 |
+
*.VisualState.xml
|
49 |
+
TestResult.xml
|
50 |
+
nunit-*.xml
|
51 |
+
|
52 |
+
# Build Results of an ATL Project
|
53 |
+
[Dd]ebugPS/
|
54 |
+
[Rr]eleasePS/
|
55 |
+
dlldata.c
|
56 |
+
|
57 |
+
# Benchmark Results
|
58 |
+
BenchmarkDotNet.Artifacts/
|
59 |
+
|
60 |
+
# .NET Core
|
61 |
+
project.lock.json
|
62 |
+
project.fragment.lock.json
|
63 |
+
artifacts/
|
64 |
+
|
65 |
+
# ASP.NET Scaffolding
|
66 |
+
ScaffoldingReadMe.txt
|
67 |
+
|
68 |
+
# StyleCop
|
69 |
+
StyleCopReport.xml
|
70 |
+
|
71 |
+
# Files built by Visual Studio
|
72 |
+
*_i.c
|
73 |
+
*_p.c
|
74 |
+
*_h.h
|
75 |
+
*.ilk
|
76 |
+
*.meta
|
77 |
+
*.obj
|
78 |
+
*.iobj
|
79 |
+
*.pch
|
80 |
+
*.pdb
|
81 |
+
*.ipdb
|
82 |
+
*.pgc
|
83 |
+
*.pgd
|
84 |
+
*.rsp
|
85 |
+
# but not Directory.Build.rsp, as it configures directory-level build defaults
|
86 |
+
!Directory.Build.rsp
|
87 |
+
*.sbr
|
88 |
+
*.tlb
|
89 |
+
*.tli
|
90 |
+
*.tlh
|
91 |
+
*.tmp
|
92 |
+
*.tmp_proj
|
93 |
+
*_wpftmp.csproj
|
94 |
+
*.log
|
95 |
+
*.tlog
|
96 |
+
*.vspscc
|
97 |
+
*.vssscc
|
98 |
+
.builds
|
99 |
+
*.pidb
|
100 |
+
*.svclog
|
101 |
+
*.scc
|
102 |
+
|
103 |
+
# Chutzpah Test files
|
104 |
+
_Chutzpah*
|
105 |
+
|
106 |
+
# Visual C++ cache files
|
107 |
+
ipch/
|
108 |
+
*.aps
|
109 |
+
*.ncb
|
110 |
+
*.opendb
|
111 |
+
*.opensdf
|
112 |
+
*.sdf
|
113 |
+
*.cachefile
|
114 |
+
*.VC.db
|
115 |
+
*.VC.VC.opendb
|
116 |
+
|
117 |
+
# Visual Studio profiler
|
118 |
+
*.psess
|
119 |
+
*.vsp
|
120 |
+
*.vspx
|
121 |
+
*.sap
|
122 |
+
|
123 |
+
# Visual Studio Trace Files
|
124 |
+
*.e2e
|
125 |
+
|
126 |
+
# TFS 2012 Local Workspace
|
127 |
+
$tf/
|
128 |
+
|
129 |
+
# Guidance Automation Toolkit
|
130 |
+
*.gpState
|
131 |
+
|
132 |
+
# ReSharper is a .NET coding add-in
|
133 |
+
_ReSharper*/
|
134 |
+
*.[Rr]e[Ss]harper
|
135 |
+
*.DotSettings.user
|
136 |
+
|
137 |
+
# TeamCity is a build add-in
|
138 |
+
_TeamCity*
|
139 |
+
|
140 |
+
# DotCover is a Code Coverage Tool
|
141 |
+
*.dotCover
|
142 |
+
|
143 |
+
# AxoCover is a Code Coverage Tool
|
144 |
+
.axoCover/*
|
145 |
+
!.axoCover/settings.json
|
146 |
+
|
147 |
+
# Coverlet is a free, cross platform Code Coverage Tool
|
148 |
+
coverage*.json
|
149 |
+
coverage*.xml
|
150 |
+
coverage*.info
|
151 |
+
|
152 |
+
# Visual Studio code coverage results
|
153 |
+
*.coverage
|
154 |
+
*.coveragexml
|
155 |
+
|
156 |
+
# NCrunch
|
157 |
+
_NCrunch_*
|
158 |
+
.*crunch*.local.xml
|
159 |
+
nCrunchTemp_*
|
160 |
+
|
161 |
+
# MightyMoose
|
162 |
+
*.mm.*
|
163 |
+
AutoTest.Net/
|
164 |
+
|
165 |
+
# Web workbench (sass)
|
166 |
+
.sass-cache/
|
167 |
+
|
168 |
+
# Installshield output folder
|
169 |
+
[Ee]xpress/
|
170 |
+
|
171 |
+
# DocProject is a documentation generator add-in
|
172 |
+
DocProject/buildhelp/
|
173 |
+
DocProject/Help/*.HxT
|
174 |
+
DocProject/Help/*.HxC
|
175 |
+
DocProject/Help/*.hhc
|
176 |
+
DocProject/Help/*.hhk
|
177 |
+
DocProject/Help/*.hhp
|
178 |
+
DocProject/Help/Html2
|
179 |
+
DocProject/Help/html
|
180 |
+
|
181 |
+
# Click-Once directory
|
182 |
+
publish/
|
183 |
+
|
184 |
+
# Publish Web Output
|
185 |
+
*.[Pp]ublish.xml
|
186 |
+
*.azurePubxml
|
187 |
+
# Note: Comment the next line if you want to checkin your web deploy settings,
|
188 |
+
# but database connection strings (with potential passwords) will be unencrypted
|
189 |
+
*.pubxml
|
190 |
+
*.publishproj
|
191 |
+
|
192 |
+
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
193 |
+
# checkin your Azure Web App publish settings, but sensitive information contained
|
194 |
+
# in these scripts will be unencrypted
|
195 |
+
PublishScripts/
|
196 |
+
|
197 |
+
# NuGet Packages
|
198 |
+
*.nupkg
|
199 |
+
# NuGet Symbol Packages
|
200 |
+
*.snupkg
|
201 |
+
# The packages folder can be ignored because of Package Restore
|
202 |
+
**/[Pp]ackages/*
|
203 |
+
# except build/, which is used as an MSBuild target.
|
204 |
+
!**/[Pp]ackages/build/
|
205 |
+
# Uncomment if necessary however generally it will be regenerated when needed
|
206 |
+
#!**/[Pp]ackages/repositories.config
|
207 |
+
# NuGet v3's project.json files produces more ignorable files
|
208 |
+
*.nuget.props
|
209 |
+
*.nuget.targets
|
210 |
+
|
211 |
+
# Microsoft Azure Build Output
|
212 |
+
csx/
|
213 |
+
*.build.csdef
|
214 |
+
|
215 |
+
# Microsoft Azure Emulator
|
216 |
+
ecf/
|
217 |
+
rcf/
|
218 |
+
|
219 |
+
# Windows Store app package directories and files
|
220 |
+
AppPackages/
|
221 |
+
BundleArtifacts/
|
222 |
+
Package.StoreAssociation.xml
|
223 |
+
_pkginfo.txt
|
224 |
+
*.appx
|
225 |
+
*.appxbundle
|
226 |
+
*.appxupload
|
227 |
+
|
228 |
+
# Visual Studio cache files
|
229 |
+
# files ending in .cache can be ignored
|
230 |
+
*.[Cc]ache
|
231 |
+
# but keep track of directories ending in .cache
|
232 |
+
!?*.[Cc]ache/
|
233 |
+
|
234 |
+
# Others
|
235 |
+
ClientBin/
|
236 |
+
~$*
|
237 |
+
*~
|
238 |
+
*.dbmdl
|
239 |
+
*.dbproj.schemaview
|
240 |
+
*.jfm
|
241 |
+
*.pfx
|
242 |
+
*.publishsettings
|
243 |
+
orleans.codegen.cs
|
244 |
+
|
245 |
+
# Including strong name files can present a security risk
|
246 |
+
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
247 |
+
#*.snk
|
248 |
+
|
249 |
+
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
250 |
+
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
251 |
+
#bower_components/
|
252 |
+
|
253 |
+
# RIA/Silverlight projects
|
254 |
+
Generated_Code/
|
255 |
+
|
256 |
+
# Backup & report files from converting an old project file
|
257 |
+
# to a newer Visual Studio version. Backup files are not needed,
|
258 |
+
# because we have git ;-)
|
259 |
+
_UpgradeReport_Files/
|
260 |
+
Backup*/
|
261 |
+
UpgradeLog*.XML
|
262 |
+
UpgradeLog*.htm
|
263 |
+
ServiceFabricBackup/
|
264 |
+
*.rptproj.bak
|
265 |
+
|
266 |
+
# SQL Server files
|
267 |
+
*.mdf
|
268 |
+
*.ldf
|
269 |
+
*.ndf
|
270 |
+
|
271 |
+
# Business Intelligence projects
|
272 |
+
*.rdl.data
|
273 |
+
*.bim.layout
|
274 |
+
*.bim_*.settings
|
275 |
+
*.rptproj.rsuser
|
276 |
+
*- [Bb]ackup.rdl
|
277 |
+
*- [Bb]ackup ([0-9]).rdl
|
278 |
+
*- [Bb]ackup ([0-9][0-9]).rdl
|
279 |
+
|
280 |
+
# Microsoft Fakes
|
281 |
+
FakesAssemblies/
|
282 |
+
|
283 |
+
# GhostDoc plugin setting file
|
284 |
+
*.GhostDoc.xml
|
285 |
+
|
286 |
+
# Node.js Tools for Visual Studio
|
287 |
+
.ntvs_analysis.dat
|
288 |
+
node_modules/
|
289 |
+
|
290 |
+
# Visual Studio 6 build log
|
291 |
+
*.plg
|
292 |
+
|
293 |
+
# Visual Studio 6 workspace options file
|
294 |
+
*.opt
|
295 |
+
|
296 |
+
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
297 |
+
*.vbw
|
298 |
+
|
299 |
+
# Visual Studio 6 auto-generated project file (contains which files were open etc.)
|
300 |
+
*.vbp
|
301 |
+
|
302 |
+
# Visual Studio 6 workspace and project file (working project files containing files to include in project)
|
303 |
+
*.dsw
|
304 |
+
*.dsp
|
305 |
+
|
306 |
+
# Visual Studio 6 technical files
|
307 |
+
*.ncb
|
308 |
+
*.aps
|
309 |
+
|
310 |
+
# Visual Studio LightSwitch build output
|
311 |
+
**/*.HTMLClient/GeneratedArtifacts
|
312 |
+
**/*.DesktopClient/GeneratedArtifacts
|
313 |
+
**/*.DesktopClient/ModelManifest.xml
|
314 |
+
**/*.Server/GeneratedArtifacts
|
315 |
+
**/*.Server/ModelManifest.xml
|
316 |
+
_Pvt_Extensions
|
317 |
+
|
318 |
+
# Paket dependency manager
|
319 |
+
.paket/paket.exe
|
320 |
+
paket-files/
|
321 |
+
|
322 |
+
# FAKE - F# Make
|
323 |
+
.fake/
|
324 |
+
|
325 |
+
# CodeRush personal settings
|
326 |
+
.cr/personal
|
327 |
+
|
328 |
+
# Python Tools for Visual Studio (PTVS)
|
329 |
+
__pycache__/
|
330 |
+
*.pyc
|
331 |
+
|
332 |
+
# Cake - Uncomment if you are using it
|
333 |
+
# tools/**
|
334 |
+
# !tools/packages.config
|
335 |
+
|
336 |
+
# Tabs Studio
|
337 |
+
*.tss
|
338 |
+
|
339 |
+
# Telerik's JustMock configuration file
|
340 |
+
*.jmconfig
|
341 |
+
|
342 |
+
# BizTalk build output
|
343 |
+
*.btp.cs
|
344 |
+
*.btm.cs
|
345 |
+
*.odx.cs
|
346 |
+
*.xsd.cs
|
347 |
+
|
348 |
+
# OpenCover UI analysis results
|
349 |
+
OpenCover/
|
350 |
+
|
351 |
+
# Azure Stream Analytics local run output
|
352 |
+
ASALocalRun/
|
353 |
+
|
354 |
+
# MSBuild Binary and Structured Log
|
355 |
+
*.binlog
|
356 |
+
|
357 |
+
# NVidia Nsight GPU debugger configuration file
|
358 |
+
*.nvuser
|
359 |
+
|
360 |
+
# MFractors (Xamarin productivity tool) working folder
|
361 |
+
.mfractor/
|
362 |
+
|
363 |
+
# Local History for Visual Studio
|
364 |
+
.localhistory/
|
365 |
+
|
366 |
+
# Visual Studio History (VSHistory) files
|
367 |
+
.vshistory/
|
368 |
+
|
369 |
+
# BeatPulse healthcheck temp database
|
370 |
+
healthchecksdb
|
371 |
+
|
372 |
+
# Backup folder for Package Reference Convert tool in Visual Studio 2017
|
373 |
+
MigrationBackup/
|
374 |
+
|
375 |
+
# Ionide (cross platform F# VS Code tools) working folder
|
376 |
+
.ionide/
|
377 |
+
|
378 |
+
# Fody - auto-generated XML schema
|
379 |
+
FodyWeavers.xsd
|
380 |
+
|
381 |
+
# VS Code files for those working on multiple tools
|
382 |
+
.vscode/*
|
383 |
+
!.vscode/settings.json
|
384 |
+
!.vscode/tasks.json
|
385 |
+
!.vscode/launch.json
|
386 |
+
!.vscode/extensions.json
|
387 |
+
*.code-workspace
|
388 |
+
|
389 |
+
# Local History for Visual Studio Code
|
390 |
+
.history/
|
391 |
+
|
392 |
+
# Windows Installer files from build outputs
|
393 |
+
*.cab
|
394 |
+
*.msi
|
395 |
+
*.msix
|
396 |
+
*.msm
|
397 |
+
*.msp
|
398 |
+
|
399 |
+
# JetBrains Rider
|
400 |
+
*.sln.iml
|
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Microsoft Open Source Code of Conduct
|
2 |
+
|
3 |
+
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
4 |
+
|
5 |
+
Resources:
|
6 |
+
|
7 |
+
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
|
8 |
+
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
|
9 |
+
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) Microsoft Corporation.
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE
|
README.md
CHANGED
@@ -1,3 +1,152 @@
|
|
1 |
-
---
|
2 |
-
license: mit
|
3 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
---
|
4 |
+
|
5 |
+
MAI-DS-R1 is a DeepSeek-R1 reasoning model that has been post-trained by the Microsoft AI team to improve its responsiveness on blocked topics and its risk profile, while maintaining its reasoning capabilities and competitive performance.
|
6 |
+
|
7 |
+
## Model Details
|
8 |
+
|
9 |
+
### Model Description
|
10 |
+
MAI-DS-R1 is a DeepSeek-R1 reasoning model that has been post-trained by the Microsoft AI team to fill in information gaps in the previous version of the model and to improve its risk profile, while maintaining R1 reasoning capabilities. The model was trained using 110k Safety and Non-Compliance examples from the [Tulu](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) 3 SFT dataset, in addition to an internally developed dataset of ~350k multilingual examples capturing various topics with reported biases.
|
11 |
+
|
12 |
+
MAI-DS-R1 has successfully unblocked the majority of previously blocked queries from the original R1 model while outperforming the recently published R1-1776 model (post-trained by Perplexity) in relevant safety benchmarks. These results were achieved while preserving the general reasoning capabilities of the original DeepSeek-R1.
|
13 |
+
|
14 |
+
*Please note: Microsoft has post-trained this model to address certain limitations relevant to its outputs, but previous limitations and considerations for the model remain, including security considerations.*
|
15 |
+
|
16 |
+
## Uses
|
17 |
+
|
18 |
+
### Direct Use
|
19 |
+
MAI-DS-R1 preserves the general reasoning capabilities of DeepSeek-R1 and can be used for broad language understanding and generation tasks, especially in complex reasoning and problem-solving. Primary direct uses include:
|
20 |
+
|
21 |
+
- **General text generation and understanding** – Producing coherent, contextually relevant text for a wide range of prompts. This includes engaging in dialogue, writing essays, or continuing a story based on a given prompt.
|
22 |
+
|
23 |
+
- **General knowledge tasks** – Answering open-domain questions requiring factual knowledge.
|
24 |
+
|
25 |
+
- **Reasoning and problem solving** – Handling multi-step reasoning tasks, such as math word problems or logic puzzles, by employing chain-of-thought strategies.
|
26 |
+
|
27 |
+
- **Code generation and comprehension** – Assisting with programming tasks by generating code snippets or explaining code.
|
28 |
+
|
29 |
+
- **Scientific and academic applications** – Assisting with structured problem-solving in STEM and research domains.
|
30 |
+
|
31 |
+
### Downstream Use *(Optional)*
|
32 |
+
|
33 |
+
The model can serve as a foundation for further fine-tuning in domain-specific reasoning tasks, such as automated tutoring systems for mathematics, coding assistants, and research tools in scientific or technical fields.
|
34 |
+
|
35 |
+
### Out-of-Scope Use
|
36 |
+
Certain application domains are out-of-scope either due to ethical/safety concerns or because the model lacks the necessary reliability in those areas. The following usage is out of scope:
|
37 |
+
|
38 |
+
- **Medical or health advice** – The model is not a medical device and has no guarantee of providing accurate medical diagnoses or safe treatment recommendations.
|
39 |
+
|
40 |
+
- **Legal advice** – The model is not a lawyer and should not be entrusted with giving definitive legal counsel, interpreting laws, or making legal decisions on its own.
|
41 |
+
|
42 |
+
- **Safety-critical systems** – The model is not suited for autonomous systems where failures could cause injury, loss of life, or significant property damage. This includes use in self-driving vehicles, aircraft control, medical life-support systems, or industrial control without human oversight.
|
43 |
+
|
44 |
+
- **High-stakes decision support** – The model should not be relied on for decisions affecting finances, security, or personal well-being, such as financial planning or investment advice.
|
45 |
+
|
46 |
+
- **Malicious or unethical use** – The model must not be used to produce harmful, illegal, deceptive, or unethical content, including hate speech, violence, harassment, or violations of privacy or IP rights.
|
47 |
+
|
48 |
+
## Bias, Risks, and Limitations
|
49 |
+
|
50 |
+
- **Biases**: The model may retain biases present in the training data and in the original DeepSeek‑R1, particularly around cultural and demographic aspects.
|
51 |
+
|
52 |
+
- **Risks**: The model may still hallucinate facts, be vulnerable to adversarial prompts, or generate unsafe, biased, or harmful content under certain conditions. Developers should implement content moderation and usage monitoring to mitigate misuse.
|
53 |
+
|
54 |
+
- **Limitations**: MAI-DS-R1 shares DeepSeek-R1’s knowledge cutoff and may lack awareness of recent events or domain-specific facts.
|
55 |
+
|
56 |
+
## Recommendations
|
57 |
+
To ensure responsible use, we recommend the following:
|
58 |
+
|
59 |
+
- **Transparency on Limitations**: It is recommended that users are made explicitly aware of the model’s potential biases and limitations.
|
60 |
+
|
61 |
+
- **Human Oversight and Verification**: Both direct and downstream users should implement human review or automated validation of outputs when deploying the model in sensitive or high-stakes scenarios.
|
62 |
+
|
63 |
+
- **Usage Safeguards**: Developers should integrate content filtering, prompt engineering best practices, and continuous monitoring to mitigate risks and ensure the model’s outputs meet the intended safety and quality standards.
|
64 |
+
|
65 |
+
- **Legal and Regulatory Compliance**: The model may output politically sensitive content (e.g., Chinese governance, historical events) that could conflict with local laws or platform policies. Operators must ensure compliance with regional regulations.
|
66 |
+
|
67 |
+
## Evaluation
|
68 |
+
|
69 |
+
### Testing Data, Factors & Metrics
|
70 |
+
|
71 |
+
#### Testing Data
|
72 |
+
|
73 |
+
The model was evaluated on a variety of benchmarks, covering different tasks and addressing both performance and harm mitigation concerns. Key benchmarks include:
|
74 |
+
|
75 |
+
1. **Public Benchmarks**: These cover a wide range of tasks, such as natural language inference, question answering, mathematical reasoning, commonsense reasoning, code generation, and code completion. They evaluate the model’s general knowledge and reasoning capabilities.
|
76 |
+
|
77 |
+
2. **Blocking Test Set**: This set consists of 3.3k prompts on various blocked topics from R1, covering 11 languages. It evaluates the model’s ability to unblock previously blocked content across different languages.
|
78 |
+
|
79 |
+
3. **Harm Mitigation Test Set**: This set is a [split](https://github.com/nouhadziri/safety-eval-fork/blob/main/evaluation/tasks/generation/harmbench/harmbench_behaviors_text_test.csv) from the [HarmBench](https://www.harmbench.org/) dataset and includes 320 queries, categorized into three functional categories: standard, contextual, and copyright. The queries cover eight semantic categories, such as misinformation/disinformation, chemical/biological threats, illegal activities, harmful content, copyright violations, cybercrime, and harassment. It evaluates the model's leakage rate of harmful or unsafe content.
|
80 |
+
|
81 |
+
#### Factors
|
82 |
+
|
83 |
+
The following factors can influence MAI-DS-R1's behavior and performance:
|
84 |
+
|
85 |
+
1. **Input topic and Sensitivity**: The model is explicitly tuned to freely discuss topics that were previously blocked. On such topics it will now provide information where the base model might have demurred. However, for truly harmful or explicitly disallowed content (e.g. instructions for violence), the model remains restrictive due to fine-tuning.
|
86 |
+
|
87 |
+
2. **Language**: Although MAI-DS-R1 was post-trained on multilingual data, it may inherit limitations from the original DeepSeek-R1 model, with performance likely strongest in English and Chinese.
|
88 |
+
|
89 |
+
3. **Prompt Complexity and Reasoning Required**: The model performs well on complex queries requiring reasoning, while very long or complex prompts could still pose a challenge.
|
90 |
+
|
91 |
+
4. **User Instructions and Role Prompts**: As a chat-oriented LLM, MAI-DS-R1’s responses can be shaped by system or developer-provided instructions (e.g. a system prompt defining its role and style) and the user's phrasing. Developers should provide clear instructions to guide the model’s behavior.
|
92 |
+
|
93 |
+
#### Metrics
|
94 |
+
|
95 |
+
1. Public benchmarks:
|
96 |
+
- Accuracy: the percentage of problems for which the model’s output matches the correct answer.
|
97 |
+
- Pass@1: the percentage of problems for which the model generates a correct solution which passes all test cases in the first attempt.
|
98 |
+
|
99 |
+
2. Blocking evaluation:
|
100 |
+
- Satisfaction (internal metric measuring relevance to the question on a [0,4] scale): The intent is to measure whether the unblocked answers actually answer the question rather than generating unrelated content.
|
101 |
+
- % Responses: The proportion of previously blocked samples successfully unblocked.
|
102 |
+
|
103 |
+
3. Harm mitigation evaluation:
|
104 |
+
- Attack Success Rate: the percentage of test cases that elicit the behavior from the model. This is evaluated per functional or semantic category.
|
105 |
+
- Micro Attack Success Rate: the total average of attack success rate over all categories.
|
106 |
+
|
107 |
+
### Results
|
108 |
+
|
109 |
+
#### Evaluation on General Knowledge and Reasoning
|
110 |
+
<p align="center">
|
111 |
+
<img src="figures/reasoning.png" alt="Benchmark Chart">
|
112 |
+
</p>
|
113 |
+
|
114 |
+
<p align="center">
|
115 |
+
<img src="figures/math.png" alt="Benchmark Chart">
|
116 |
+
</p>
|
117 |
+
|
118 |
+
<p align="center">
|
119 |
+
<img src="figures/coding.png" alt="Benchmark Chart">
|
120 |
+
</p>
|
121 |
+
|
122 |
+
#### Evaluation on Responsiveness
|
123 |
+
<p align="center">
|
124 |
+
<table>
|
125 |
+
<tr>
|
126 |
+
<td><img src="figures/responsiveness.png" width="500"/></td>
|
127 |
+
<td><img src="figures/satisfaction.png" width="500"/></td>
|
128 |
+
</tr>
|
129 |
+
</table>
|
130 |
+
</p>
|
131 |
+
|
132 |
+
#### Evaluation on Harm Mitigation
|
133 |
+
<p align="center">
|
134 |
+
<img src="figures/harm_mitigation_answer_only.png" alt="Benchmark Chart">
|
135 |
+
</p>
|
136 |
+
|
137 |
+
<p align="center">
|
138 |
+
<img src="figures/harm_mitigation_thinking_only.png" alt="Benchmark Chart">
|
139 |
+
</p>
|
140 |
+
|
141 |
+
#### Summary
|
142 |
+
- **General Knowledge & Reasoning**: MAI-DS-R1 performs on par with DeepSeek-R1 and slightly better than R1-1776, especially excelling in mgsm_chain_of_thought_zh, where R1-1776 had a significant regression.
|
143 |
+
|
144 |
+
- **Blocked Topics**: MAI-DS-R1 successfully unblocked 99.3% of samples, matching R1-1776, and achieved a higher Satisfaction score, likely due to more relevant responses.
|
145 |
+
|
146 |
+
- **Harm Mitigation**: MAI-DS-R1 outperforms both R1-1776 and the original R1 model in minimizing harmful content.
|
147 |
+
### Model Architecture and Objective
|
148 |
+
- **Model Name**: MAI-DS-R1 (671B)
|
149 |
+
- **Architecture**: Based on DeepSeek-R1, a transformer-based autoregressive language model utilizing multi-head self-attention and Mixture-of-Experts (MoE) for scalable and efficient inference.
|
150 |
+
- **Objective**: Post-trained to reduce CCP-aligned restrictions and enhance harm protection, while preserving the original model’s strong chain-of-thought reasoning and general-purpose language understanding capabilities.
|
151 |
+
- **Pre-trained Model Base**: DeepSeek-R1 (671B)
|
152 |
+
|
SECURITY.md
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
|
2 |
+
|
3 |
+
## Security
|
4 |
+
|
5 |
+
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
|
6 |
+
|
7 |
+
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
|
8 |
+
|
9 |
+
## Reporting Security Issues
|
10 |
+
|
11 |
+
**Please do not report security vulnerabilities through public GitHub issues.**
|
12 |
+
|
13 |
+
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
|
14 |
+
|
15 |
+
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
|
16 |
+
|
17 |
+
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
|
18 |
+
|
19 |
+
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
|
20 |
+
|
21 |
+
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
|
22 |
+
* Full paths of source file(s) related to the manifestation of the issue
|
23 |
+
* The location of the affected source code (tag/branch/commit or direct URL)
|
24 |
+
* Any special configuration required to reproduce the issue
|
25 |
+
* Step-by-step instructions to reproduce the issue
|
26 |
+
* Proof-of-concept or exploit code (if possible)
|
27 |
+
* Impact of the issue, including how an attacker might exploit the issue
|
28 |
+
|
29 |
+
This information will help us triage your report more quickly.
|
30 |
+
|
31 |
+
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
|
32 |
+
|
33 |
+
## Preferred Languages
|
34 |
+
|
35 |
+
We prefer all communications to be in English.
|
36 |
+
|
37 |
+
## Policy
|
38 |
+
|
39 |
+
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
|
40 |
+
|
41 |
+
<!-- END MICROSOFT SECURITY.MD BLOCK -->
|
SUPPORT.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TODO: The maintainer of this repo has not yet edited this file
|
2 |
+
|
3 |
+
**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
|
4 |
+
|
5 |
+
- **No CSS support:** Fill out this template with information about how to file issues and get help.
|
6 |
+
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
|
7 |
+
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
|
8 |
+
|
9 |
+
*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
|
10 |
+
|
11 |
+
# Support
|
12 |
+
|
13 |
+
## How to file issues and get help
|
14 |
+
|
15 |
+
This project uses GitHub Issues to track bugs and feature requests. Please search the existing
|
16 |
+
issues before filing new issues to avoid duplicates. For new issues, file your bug or
|
17 |
+
feature request as a new Issue.
|
18 |
+
|
19 |
+
For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
|
20 |
+
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
|
21 |
+
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
|
22 |
+
|
23 |
+
## Microsoft Support Policy
|
24 |
+
|
25 |
+
Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
|
config.json
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"DeepseekV3ForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"auto_map": {
|
8 |
+
"AutoConfig": "configuration_deepseek.DeepseekV3Config",
|
9 |
+
"AutoModel": "modeling_deepseek.DeepseekV3Model",
|
10 |
+
"AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
|
11 |
+
},
|
12 |
+
"aux_loss_alpha": 0.001,
|
13 |
+
"bos_token_id": 0,
|
14 |
+
"eos_token_id": 1,
|
15 |
+
"ep_size": 1,
|
16 |
+
"first_k_dense_replace": 3,
|
17 |
+
"hidden_act": "silu",
|
18 |
+
"hidden_size": 7168,
|
19 |
+
"initializer_range": 0.02,
|
20 |
+
"intermediate_size": 18432,
|
21 |
+
"kv_lora_rank": 512,
|
22 |
+
"max_position_embeddings": 163840,
|
23 |
+
"model_type": "deepseek_v3",
|
24 |
+
"moe_intermediate_size": 2048,
|
25 |
+
"moe_layer_freq": 1,
|
26 |
+
"n_group": 8,
|
27 |
+
"n_routed_experts": 256,
|
28 |
+
"n_shared_experts": 1,
|
29 |
+
"norm_topk_prob": true,
|
30 |
+
"num_attention_heads": 128,
|
31 |
+
"num_experts_per_tok": 8,
|
32 |
+
"num_hidden_layers": 61,
|
33 |
+
"num_key_value_heads": 128,
|
34 |
+
"num_nextn_predict_layers": 1,
|
35 |
+
"pretraining_tp": 1,
|
36 |
+
"q_lora_rank": 1536,
|
37 |
+
"qk_nope_head_dim": 128,
|
38 |
+
"qk_rope_head_dim": 64,
|
39 |
+
"rms_norm_eps": 1e-06,
|
40 |
+
"rope_scaling": {
|
41 |
+
"beta_fast": 32,
|
42 |
+
"beta_slow": 1,
|
43 |
+
"factor": 40,
|
44 |
+
"mscale": 1.0,
|
45 |
+
"mscale_all_dim": 1.0,
|
46 |
+
"original_max_position_embeddings": 4096,
|
47 |
+
"type": "yarn"
|
48 |
+
},
|
49 |
+
"rope_theta": 10000,
|
50 |
+
"routed_scaling_factor": 2.5,
|
51 |
+
"scoring_func": "sigmoid",
|
52 |
+
"seq_aux": true,
|
53 |
+
"tie_word_embeddings": false,
|
54 |
+
"topk_group": 4,
|
55 |
+
"topk_method": "noaux_tc",
|
56 |
+
"torch_dtype": "bfloat16",
|
57 |
+
"transformers_version": "4.46.3",
|
58 |
+
"use_cache": true,
|
59 |
+
"v_head_dim": 128,
|
60 |
+
"vocab_size": 129280,
|
61 |
+
"quantization_config": {
|
62 |
+
"activation_scheme": "dynamic",
|
63 |
+
"fmt": "e4m3",
|
64 |
+
"quant_method": "fp8",
|
65 |
+
"weight_block_size": [
|
66 |
+
128,
|
67 |
+
128
|
68 |
+
]
|
69 |
+
}
|
70 |
+
}
|
configuration_deepseek.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers.configuration_utils import PretrainedConfig
|
2 |
+
from transformers.utils import logging
|
3 |
+
|
4 |
+
logger = logging.get_logger(__name__)
|
5 |
+
|
6 |
+
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
7 |
+
class DeepseekV3Config(PretrainedConfig):
|
8 |
+
r"""
|
9 |
+
This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
|
10 |
+
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
11 |
+
defaults will yield a similar configuration to that of the DeepSeek-V3.
|
12 |
+
|
13 |
+
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
14 |
+
documentation from [`PretrainedConfig`] for more information.
|
15 |
+
|
16 |
+
|
17 |
+
Args:
|
18 |
+
vocab_size (`int`, *optional*, defaults to 129280):
|
19 |
+
Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
|
20 |
+
`input_ids` passed when calling [`DeepseekV3Model`]
|
21 |
+
hidden_size (`int`, *optional*, defaults to 7168):
|
22 |
+
Dimension of the hidden representations.
|
23 |
+
intermediate_size (`int`, *optional*, defaults to 18432):
|
24 |
+
Dimension of the MLP representations.
|
25 |
+
moe_intermediate_size (`int`, *optional*, defaults to 2048):
|
26 |
+
Dimension of the MoE representations.
|
27 |
+
num_hidden_layers (`int`, *optional*, defaults to 61):
|
28 |
+
Number of hidden layers in the Transformer decoder.
|
29 |
+
num_nextn_predict_layers (`int`, *optional*, defaults to 1):
|
30 |
+
Number of nextn predict layers in the DeepSeekV3 Model.
|
31 |
+
num_attention_heads (`int`, *optional*, defaults to 128):
|
32 |
+
Number of attention heads for each attention layer in the Transformer decoder.
|
33 |
+
n_shared_experts (`int`, *optional*, defaults to 1):
|
34 |
+
Number of shared experts, None means dense model.
|
35 |
+
n_routed_experts (`int`, *optional*, defaults to 256):
|
36 |
+
Number of routed experts, None means dense model.
|
37 |
+
routed_scaling_factor (`float`, *optional*, defaults to 2.5):
|
38 |
+
Scaling factor for routed experts.
|
39 |
+
topk_method (`str`, *optional*, defaults to `'noaux_tc'`):
|
40 |
+
Topk method used in routed gate.
|
41 |
+
n_group (`int`, *optional*, defaults to 8):
|
42 |
+
Number of groups for routed experts.
|
43 |
+
topk_group (`int`, *optional*, defaults to 4):
|
44 |
+
Number of selected groups for each token (the experts selected for a token are restricted to lie within `topk_group` groups).
|
45 |
+
num_experts_per_tok (`int`, *optional*, defaults to 8):
|
46 |
+
Number of selected experts, None means dense model.
|
47 |
+
moe_layer_freq (`int`, *optional*, defaults to 1):
|
48 |
+
The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
|
49 |
+
first_k_dense_replace (`int`, *optional*, defaults to 3):
|
50 |
+
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
|
51 |
+
\--k dense layers--/
|
52 |
+
norm_topk_prob (`bool`, *optional*, defaults to True):
|
53 |
+
Whether to normalize the weights of the routed experts.
|
54 |
+
scoring_func (`str`, *optional*, defaults to 'sigmoid'):
|
55 |
+
Method of computing expert weights.
|
56 |
+
aux_loss_alpha (`float`, *optional*, defaults to 0.001):
|
57 |
+
Auxiliary loss weight coefficient.
|
58 |
+
seq_aux (`bool`, *optional*, defaults to True):
|
59 |
+
Whether to compute the auxiliary loss for each individual sample.
|
60 |
+
num_key_value_heads (`int`, *optional*):
|
61 |
+
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
62 |
+
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
63 |
+
`num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
|
64 |
+
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
65 |
+
by meanpooling all the original heads within that group. For more details checkout [this
|
66 |
+
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
67 |
+
`num_attention_heads`.
|
68 |
+
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
69 |
+
The non-linear activation function (function or string) in the decoder.
|
70 |
+
max_position_embeddings (`int`, *optional*, defaults to 4096):
|
71 |
+
The maximum sequence length that this model might ever be used with.
|
72 |
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
73 |
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
74 |
+
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
75 |
+
The epsilon used by the rms normalization layers.
|
76 |
+
use_cache (`bool`, *optional*, defaults to `True`):
|
77 |
+
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
78 |
+
relevant if `config.is_decoder=True`.
|
79 |
+
pad_token_id (`int`, *optional*):
|
80 |
+
Padding token id.
|
81 |
+
bos_token_id (`int`, *optional*, defaults to 0):
|
82 |
+
Beginning of stream token id.
|
83 |
+
eos_token_id (`int`, *optional*, defaults to 1):
|
84 |
+
End of stream token id.
|
85 |
+
pretraining_tp (`int`, *optional*, defaults to 1):
|
86 |
+
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
|
87 |
+
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
|
88 |
+
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
|
89 |
+
issue](https://github.com/pytorch/pytorch/issues/76232).
|
90 |
+
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
91 |
+
Whether to tie weight embeddings
|
92 |
+
rope_theta (`float`, *optional*, defaults to 10000.0):
|
93 |
+
The base period of the RoPE embeddings.
|
94 |
+
rope_scaling (`Dict`, *optional*):
|
95 |
+
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
|
96 |
+
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
|
97 |
+
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
98 |
+
`max_position_embeddings` to the expected new maximum.
|
99 |
+
attention_bias (`bool`, *optional*, defaults to `False`):
|
100 |
+
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
101 |
+
attention_dropout (`float`, *optional*, defaults to 0.0):
|
102 |
+
The dropout ratio for the attention probabilities.
|
103 |
+
|
104 |
+
```python
|
105 |
+
>>> from transformers import DeepseekV3Model, DeepseekV3Config
|
106 |
+
|
107 |
+
>>> # Initializing a Deepseek-V3 style configuration
|
108 |
+
>>> configuration = DeepseekV3Config()
|
109 |
+
|
110 |
+
>>> # Accessing the model configuration
|
111 |
+
>>> configuration = model.config
|
112 |
+
```"""
|
113 |
+
|
114 |
+
model_type = "deepseek_v3"
|
115 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
116 |
+
|
117 |
+
def __init__(
|
118 |
+
self,
|
119 |
+
vocab_size=129280,
|
120 |
+
hidden_size=7168,
|
121 |
+
intermediate_size=18432,
|
122 |
+
moe_intermediate_size = 2048,
|
123 |
+
num_hidden_layers=61,
|
124 |
+
num_nextn_predict_layers=1,
|
125 |
+
num_attention_heads=128,
|
126 |
+
num_key_value_heads=128,
|
127 |
+
n_shared_experts = 1,
|
128 |
+
n_routed_experts = 256,
|
129 |
+
ep_size = 1,
|
130 |
+
routed_scaling_factor = 2.5,
|
131 |
+
kv_lora_rank = 512,
|
132 |
+
q_lora_rank = 1536,
|
133 |
+
qk_rope_head_dim = 64,
|
134 |
+
v_head_dim = 128,
|
135 |
+
qk_nope_head_dim = 128,
|
136 |
+
topk_method = 'noaux_tc',
|
137 |
+
n_group = 8,
|
138 |
+
topk_group = 4,
|
139 |
+
num_experts_per_tok = 8,
|
140 |
+
moe_layer_freq = 1,
|
141 |
+
first_k_dense_replace = 3,
|
142 |
+
norm_topk_prob = True,
|
143 |
+
scoring_func = 'sigmoid',
|
144 |
+
aux_loss_alpha = 0.001,
|
145 |
+
seq_aux = True,
|
146 |
+
hidden_act="silu",
|
147 |
+
max_position_embeddings=4096,
|
148 |
+
initializer_range=0.02,
|
149 |
+
rms_norm_eps=1e-6,
|
150 |
+
use_cache=True,
|
151 |
+
pad_token_id=None,
|
152 |
+
bos_token_id=0,
|
153 |
+
eos_token_id=1,
|
154 |
+
pretraining_tp=1,
|
155 |
+
tie_word_embeddings=False,
|
156 |
+
rope_theta=10000.0,
|
157 |
+
rope_scaling=None,
|
158 |
+
attention_bias=False,
|
159 |
+
attention_dropout=0.0,
|
160 |
+
**kwargs,
|
161 |
+
):
|
162 |
+
self.vocab_size = vocab_size
|
163 |
+
self.max_position_embeddings = max_position_embeddings
|
164 |
+
self.hidden_size = hidden_size
|
165 |
+
self.intermediate_size = intermediate_size
|
166 |
+
self.moe_intermediate_size = moe_intermediate_size
|
167 |
+
self.num_hidden_layers = num_hidden_layers
|
168 |
+
self.num_nextn_predict_layers = num_nextn_predict_layers
|
169 |
+
self.num_attention_heads = num_attention_heads
|
170 |
+
self.n_shared_experts = n_shared_experts
|
171 |
+
self.n_routed_experts = n_routed_experts
|
172 |
+
self.ep_size = ep_size
|
173 |
+
self.routed_scaling_factor = routed_scaling_factor
|
174 |
+
self.kv_lora_rank = kv_lora_rank
|
175 |
+
self.q_lora_rank = q_lora_rank
|
176 |
+
self.qk_rope_head_dim = qk_rope_head_dim
|
177 |
+
self.v_head_dim = v_head_dim
|
178 |
+
self.qk_nope_head_dim = qk_nope_head_dim
|
179 |
+
self.topk_method = topk_method
|
180 |
+
self.n_group = n_group
|
181 |
+
self.topk_group = topk_group
|
182 |
+
self.num_experts_per_tok = num_experts_per_tok
|
183 |
+
self.moe_layer_freq = moe_layer_freq
|
184 |
+
self.first_k_dense_replace = first_k_dense_replace
|
185 |
+
self.norm_topk_prob = norm_topk_prob
|
186 |
+
self.scoring_func = scoring_func
|
187 |
+
self.aux_loss_alpha = aux_loss_alpha
|
188 |
+
self.seq_aux = seq_aux
|
189 |
+
# for backward compatibility
|
190 |
+
if num_key_value_heads is None:
|
191 |
+
num_key_value_heads = num_attention_heads
|
192 |
+
|
193 |
+
self.num_key_value_heads = num_key_value_heads
|
194 |
+
self.hidden_act = hidden_act
|
195 |
+
self.initializer_range = initializer_range
|
196 |
+
self.rms_norm_eps = rms_norm_eps
|
197 |
+
self.pretraining_tp = pretraining_tp
|
198 |
+
self.use_cache = use_cache
|
199 |
+
self.rope_theta = rope_theta
|
200 |
+
self.rope_scaling = rope_scaling
|
201 |
+
self.attention_bias = attention_bias
|
202 |
+
self.attention_dropout = attention_dropout
|
203 |
+
|
204 |
+
super().__init__(
|
205 |
+
pad_token_id=pad_token_id,
|
206 |
+
bos_token_id=bos_token_id,
|
207 |
+
eos_token_id=eos_token_id,
|
208 |
+
tie_word_embeddings=tie_word_embeddings,
|
209 |
+
**kwargs,
|
210 |
+
)
|
git_add_safetensors.sh
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git add model-00001-of-000163.safetensors
|
2 |
+
git add model-00002-of-000163.safetensors
|
3 |
+
git add model-00003-of-000163.safetensors
|
4 |
+
git add model-00004-of-000163.safetensors
|
5 |
+
git add model-00005-of-000163.safetensors
|
6 |
+
git add model-00006-of-000163.safetensors
|
7 |
+
git add model-00007-of-000163.safetensors
|
8 |
+
git add model-00008-of-000163.safetensors
|
9 |
+
git add model-00009-of-000163.safetensors
|
10 |
+
git add model-00010-of-000163.safetensors
|
11 |
+
git add model-00011-of-000163.safetensors
|
12 |
+
git add model-00012-of-000163.safetensors
|
13 |
+
git add model-00013-of-000163.safetensors
|
14 |
+
git add model-00014-of-000163.safetensors
|
15 |
+
git add model-00015-of-000163.safetensors
|
16 |
+
git add model-00016-of-000163.safetensors
|
17 |
+
git add model-00017-of-000163.safetensors
|
18 |
+
git add model-00018-of-000163.safetensors
|
19 |
+
git add model-00019-of-000163.safetensors
|
20 |
+
git add model-00020-of-000163.safetensors
|
21 |
+
git add model-00021-of-000163.safetensors
|
22 |
+
git add model-00022-of-000163.safetensors
|
23 |
+
git add model-00023-of-000163.safetensors
|
24 |
+
git add model-00024-of-000163.safetensors
|
25 |
+
git add model-00025-of-000163.safetensors
|
26 |
+
git add model-00026-of-000163.safetensors
|
27 |
+
echo 'processed 25 files'
|
28 |
+
git add model-00027-of-000163.safetensors
|
29 |
+
git add model-00028-of-000163.safetensors
|
30 |
+
git add model-00029-of-000163.safetensors
|
31 |
+
git add model-00030-of-000163.safetensors
|
32 |
+
git add model-00031-of-000163.safetensors
|
33 |
+
git add model-00032-of-000163.safetensors
|
34 |
+
git add model-00033-of-000163.safetensors
|
35 |
+
git add model-00034-of-000163.safetensors
|
36 |
+
git add model-00035-of-000163.safetensors
|
37 |
+
git add model-00036-of-000163.safetensors
|
38 |
+
git add model-00037-of-000163.safetensors
|
39 |
+
git add model-00038-of-000163.safetensors
|
40 |
+
git add model-00039-of-000163.safetensors
|
41 |
+
git add model-00040-of-000163.safetensors
|
42 |
+
git add model-00041-of-000163.safetensors
|
43 |
+
git add model-00042-of-000163.safetensors
|
44 |
+
git add model-00043-of-000163.safetensors
|
45 |
+
git add model-00044-of-000163.safetensors
|
46 |
+
git add model-00045-of-000163.safetensors
|
47 |
+
git add model-00046-of-000163.safetensors
|
48 |
+
git add model-00047-of-000163.safetensors
|
49 |
+
git add model-00048-of-000163.safetensors
|
50 |
+
git add model-00049-of-000163.safetensors
|
51 |
+
git add model-00050-of-000163.safetensors
|
52 |
+
git add model-00051-of-000163.safetensors
|
53 |
+
echo 'processed 50 files'
|
54 |
+
git add model-00052-of-000163.safetensors
|
55 |
+
git add model-00053-of-000163.safetensors
|
56 |
+
git add model-00054-of-000163.safetensors
|
57 |
+
git add model-00055-of-000163.safetensors
|
58 |
+
git add model-00056-of-000163.safetensors
|
59 |
+
git add model-00057-of-000163.safetensors
|
60 |
+
git add model-00058-of-000163.safetensors
|
61 |
+
git add model-00059-of-000163.safetensors
|
62 |
+
git add model-00060-of-000163.safetensors
|
63 |
+
git add model-00061-of-000163.safetensors
|
64 |
+
git add model-00062-of-000163.safetensors
|
65 |
+
git add model-00063-of-000163.safetensors
|
66 |
+
git add model-00064-of-000163.safetensors
|
67 |
+
git add model-00065-of-000163.safetensors
|
68 |
+
git add model-00066-of-000163.safetensors
|
69 |
+
git add model-00067-of-000163.safetensors
|
70 |
+
git add model-00068-of-000163.safetensors
|
71 |
+
git add model-00069-of-000163.safetensors
|
72 |
+
git add model-00070-of-000163.safetensors
|
73 |
+
git add model-00071-of-000163.safetensors
|
74 |
+
git add model-00072-of-000163.safetensors
|
75 |
+
git add model-00073-of-000163.safetensors
|
76 |
+
git add model-00074-of-000163.safetensors
|
77 |
+
git add model-00075-of-000163.safetensors
|
78 |
+
git add model-00076-of-000163.safetensors
|
79 |
+
echo 'processed 75 files'
|
80 |
+
git add model-00077-of-000163.safetensors
|
81 |
+
git add model-00078-of-000163.safetensors
|
82 |
+
git add model-00079-of-000163.safetensors
|
83 |
+
git add model-00080-of-000163.safetensors
|
84 |
+
git add model-00081-of-000163.safetensors
|
85 |
+
git add model-00082-of-000163.safetensors
|
86 |
+
git add model-00083-of-000163.safetensors
|
87 |
+
git add model-00084-of-000163.safetensors
|
88 |
+
git add model-00085-of-000163.safetensors
|
89 |
+
git add model-00086-of-000163.safetensors
|
90 |
+
git add model-00087-of-000163.safetensors
|
91 |
+
git add model-00088-of-000163.safetensors
|
92 |
+
git add model-00089-of-000163.safetensors
|
93 |
+
git add model-00090-of-000163.safetensors
|
94 |
+
git add model-00091-of-000163.safetensors
|
95 |
+
git add model-00092-of-000163.safetensors
|
96 |
+
git add model-00093-of-000163.safetensors
|
97 |
+
git add model-00094-of-000163.safetensors
|
98 |
+
git add model-00095-of-000163.safetensors
|
99 |
+
git add model-00096-of-000163.safetensors
|
100 |
+
git add model-00097-of-000163.safetensors
|
101 |
+
git add model-00098-of-000163.safetensors
|
102 |
+
git add model-00099-of-000163.safetensors
|
103 |
+
git add model-00100-of-000163.safetensors
|
104 |
+
git add model-00101-of-000163.safetensors
|
105 |
+
echo 'processed 100 files'
|
106 |
+
git add model-00102-of-000163.safetensors
|
107 |
+
git add model-00103-of-000163.safetensors
|
108 |
+
git add model-00104-of-000163.safetensors
|
109 |
+
git add model-00105-of-000163.safetensors
|
110 |
+
git add model-00106-of-000163.safetensors
|
111 |
+
git add model-00107-of-000163.safetensors
|
112 |
+
git add model-00108-of-000163.safetensors
|
113 |
+
git add model-00109-of-000163.safetensors
|
114 |
+
git add model-00110-of-000163.safetensors
|
115 |
+
git add model-00111-of-000163.safetensors
|
116 |
+
git add model-00112-of-000163.safetensors
|
117 |
+
git add model-00113-of-000163.safetensors
|
118 |
+
git add model-00114-of-000163.safetensors
|
119 |
+
git add model-00115-of-000163.safetensors
|
120 |
+
git add model-00116-of-000163.safetensors
|
121 |
+
git add model-00117-of-000163.safetensors
|
122 |
+
git add model-00118-of-000163.safetensors
|
123 |
+
git add model-00119-of-000163.safetensors
|
124 |
+
git add model-00120-of-000163.safetensors
|
125 |
+
git add model-00121-of-000163.safetensors
|
126 |
+
git add model-00122-of-000163.safetensors
|
127 |
+
git add model-00123-of-000163.safetensors
|
128 |
+
git add model-00124-of-000163.safetensors
|
129 |
+
git add model-00125-of-000163.safetensors
|
130 |
+
git add model-00126-of-000163.safetensors
|
131 |
+
echo 'processed 125 files'
|
132 |
+
git add model-00127-of-000163.safetensors
|
133 |
+
git add model-00128-of-000163.safetensors
|
134 |
+
git add model-00129-of-000163.safetensors
|
135 |
+
git add model-00130-of-000163.safetensors
|
136 |
+
git add model-00131-of-000163.safetensors
|
137 |
+
git add model-00132-of-000163.safetensors
|
138 |
+
git add model-00133-of-000163.safetensors
|
139 |
+
git add model-00134-of-000163.safetensors
|
140 |
+
git add model-00135-of-000163.safetensors
|
141 |
+
git add model-00136-of-000163.safetensors
|
142 |
+
git add model-00137-of-000163.safetensors
|
143 |
+
git add model-00138-of-000163.safetensors
|
144 |
+
git add model-00139-of-000163.safetensors
|
145 |
+
git add model-00140-of-000163.safetensors
|
146 |
+
git add model-00141-of-000163.safetensors
|
147 |
+
git add model-00142-of-000163.safetensors
|
148 |
+
git add model-00143-of-000163.safetensors
|
149 |
+
git add model-00144-of-000163.safetensors
|
150 |
+
git add model-00145-of-000163.safetensors
|
151 |
+
git add model-00146-of-000163.safetensors
|
152 |
+
git add model-00147-of-000163.safetensors
|
153 |
+
git add model-00148-of-000163.safetensors
|
154 |
+
git add model-00149-of-000163.safetensors
|
155 |
+
git add model-00150-of-000163.safetensors
|
156 |
+
git add model-00151-of-000163.safetensors
|
157 |
+
echo 'processed 150 files'
|
158 |
+
git add model-00152-of-000163.safetensors
|
159 |
+
git add model-00153-of-000163.safetensors
|
160 |
+
git add model-00154-of-000163.safetensors
|
161 |
+
git add model-00155-of-000163.safetensors
|
162 |
+
git add model-00156-of-000163.safetensors
|
163 |
+
git add model-00157-of-000163.safetensors
|
164 |
+
git add model-00158-of-000163.safetensors
|
165 |
+
git add model-00159-of-000163.safetensors
|
166 |
+
git add model-00160-of-000163.safetensors
|
167 |
+
git add model-00161-of-000163.safetensors
|
168 |
+
git add model-00162-of-000163.safetensors
|
169 |
+
git add model-00163-of-000163.safetensors
|
170 |
+
git add *.py
|
171 |
+
git add *.json
|
172 |
+
git add *.md
|
173 |
+
git commit -m 'Add model files for fp8 model'
|
174 |
+
git push origin main
|
model-00001-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:95a4bf0c027b1d69113ef811423cfef239d1b2cde1ca9b4ef45aa5b12718ba49
|
3 |
+
size 5234141992
|
model-00002-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:56d1bed0d160b6e0bf2f7e45b8b7a560b5af4c030c143c0c320ad7fe8ac8f9c5
|
3 |
+
size 4302381728
|
model-00003-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22441ba5c28f1b4b9a5c11705088e953878b2eb8f380cfc52502538239e12a51
|
3 |
+
size 4302382136
|
model-00004-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce6924defa9716c5908f4f6ecf140d73f1d75f9ffca556bed9b8e1ecf63aa909
|
3 |
+
size 4302347768
|
model-00005-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41b9dacba087db138060c2b030e6c5f325f5d41b0a9f845243aa150ffc7ce289
|
3 |
+
size 4302381912
|
model-00006-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:242cfb96c0264365f84a8b703780c6f85009f75629c153ff83c9c51ad56f0c6d
|
3 |
+
size 4372096432
|
model-00007-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f85c18e82d22026821f4d1533cf5f164e2ebea42740b9c9321bb16332503353a
|
3 |
+
size 4306052768
|
model-00008-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e78a1079359a5cd9a26b4638a727d3e4d7ca655ec0b437a043003e7169526e7a
|
3 |
+
size 4302382120
|
model-00009-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70096f0cbca225f57ebb869aa3fbd304bc5100092a636556f6fdb136ae13e678
|
3 |
+
size 4302347960
|
model-00010-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9d9aa5abb95cd9d30229232641e077ff0975e41441130cddf806af683113cab9
|
3 |
+
size 4302381720
|
model-00011-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d131ec934e611b03e72e01cf2af68d90de8b69f4756a7f2f6042cad45cbfbc7
|
3 |
+
size 4302382136
|
model-00012-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:97db7836b235d7cc500c13008508a2929146f5c0f3518bb1c858497aa77e2424
|
3 |
+
size 1321579568
|
model-00013-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2acbb2d92061e2f4a1cd3ad7c280babc1a806b4ee61ece87f2b5a6d6fbe80e9b
|
3 |
+
size 4302318704
|
model-00014-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa0b7cc51644cbc30af9c9a8b0340f0fbb187bb1ef5d253f418e64235b4fdf1d
|
3 |
+
size 4302382088
|
model-00015-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57ee63fb8bca8cfad8027feeedaf658899e113e884e77f87f3f243516d0084b1
|
3 |
+
size 4302347992
|
model-00016-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae0a455f6a10e9c47563b0095f2ccdd2c1bbdeac9090ed54d7e2689d626c00d4
|
3 |
+
size 4302381688
|
model-00017-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b952a7be9202c00482bded22532fce234df76aeb361769caaa9f4827151cd20
|
3 |
+
size 4302382136
|
model-00018-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d7d58ca27c6fe18863b502d03a97130ec8ac5bf3828d15bd1225f606d7b69d1
|
3 |
+
size 4302347800
|
model-00019-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a17d701f68c9216c70e64819d007203f29732fd7413c14e958fb24963202cfaf
|
3 |
+
size 4302381880
|
model-00020-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef79db9b9a8d6d11f87f6e668ebd04ad896880c489d34b4f5d7bbc368a36032e
|
3 |
+
size 4302382136
|
model-00021-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e8724374501a2c947f2131e84900a02d6c366e0e66d9a8a4cd14d322821925f
|
3 |
+
size 4302348168
|
model-00022-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f648f0a647d57d46ea5093e205db3a126ce9fe1934d176fcee428ece154c9b10
|
3 |
+
size 4302382664
|
model-00023-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:129801014fbb3e75c69b3e19d685a7d9c3645db3f4374e1101d2a6102fbacc77
|
3 |
+
size 4302348576
|
model-00024-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:758ea2f279f53f2264ddc18676943e7b074e7e7330f61321ae0424bf1a6cea0f
|
3 |
+
size 4302382264
|
model-00025-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57fbca90feb977893898392080123b6ce6cdec7ea08545c89022b82244311295
|
3 |
+
size 4302382720
|
model-00026-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c65fcaf1d77bd4e9673ae72079acc773d912e9b8d7d63dba41b4ff1a30902b9d
|
3 |
+
size 4302348392
|
model-00027-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2473e6c1c4077504ca5c2015a21ff0ebb8aa8da866edef014fc7fe1545dee832
|
3 |
+
size 4302382448
|
model-00028-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6a63c74db429e97fbc20fd107124d21f3e4b862337493399522a94eeb2a537a
|
3 |
+
size 4302382720
|
model-00029-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a4f139f4b3f4e69077b6fc163687ee7e18407b94b7e9bd8fca520d6ca5cd6cd
|
3 |
+
size 4302348200
|
model-00030-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4db82e3e84a7bebda6cc75745c7f4506a8a73845488e4687fb1cdab064f46da5
|
3 |
+
size 4302382640
|
model-00031-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:618d24c0b7f32b1ee7321f652c2e803fdcda2dc143575b5a4e6c042e91c67c6c
|
3 |
+
size 4302348600
|
model-00032-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3004643272fd915ee297efeb565b4023c14bb2cde00cf57ffc9a28f2992abe27
|
3 |
+
size 4302382248
|
model-00033-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9fed9761acf8761e5ff4f7088114d87b183fdd6017ad6f15cceb5bcf742bce3
|
3 |
+
size 4302382720
|
model-00034-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52fc8fd6faf4b1fd5b3811d9d2193daf9d38f1211f0e48be4a258a4dfe7f8e95
|
3 |
+
size 1747412864
|
model-00035-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:863c17c020b66fd59c2784211042467b1ac3b566a7e66f83a44624d46ec2710b
|
3 |
+
size 4302319280
|
model-00036-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09d66b07f814f407d7ca6b4e02c9873e12c46c59dec3b899c8d70b4fede3ff02
|
3 |
+
size 4302382672
|
model-00037-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f71f878637b6d4ee55d86d5226b0c3b891a4479afe3981a01a795ad958255492
|
3 |
+
size 4302348568
|
model-00038-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e8cf5f130663f2e5faed49470a1745a1be4eefb213e63ad5a8ca3ed3c2d598b
|
3 |
+
size 4302382280
|
model-00039-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4111100a612a17683c607c9957059d53997b6edb6c25e3d47e5291caf71ceea0
|
3 |
+
size 4302382720
|
model-00040-of-000163.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5938eb522c1c5780d0ace33bc6d471b154ecff30b88e5727bf9a3b57e493653a
|
3 |
+
size 4302348376
|