Table of Contents

- CodeGeex2
- ChatGLM2_6B
- Baichuan2_13B
- sqlcoder
- Testing after startup
 
## CodeGeex2

```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch
import argparse

try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except ImportError:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False

# Comment prefix used to tag the target language at the top of the prompt.
LANGUAGE_TAG = {
    "Abap"          : "* language: Abap",
    "ActionScript"  : "// language: ActionScript",
    "Ada"           : "-- language: Ada",
    "Agda"          : "-- language: Agda",
    "ANTLR"         : "// language: ANTLR",
    "AppleScript"   : "-- language: AppleScript",
    "Assembly"      : "; language: Assembly",
    "Augeas"        : "// language: Augeas",
    "AWK"           : "// language: AWK",
    "Basic"         : "' language: Basic",
    "C"             : "// language: C",
    "C#"            : "// language: C#",
    "C++"           : "// language: C++",
    "CMake"         : "# language: CMake",
    "Cobol"         : "// language: Cobol",
    "CSS"           : "/* language: CSS */",
    "CUDA"          : "// language: Cuda",
    "Dart"          : "// language: Dart",
    "Delphi"        : "{language: Delphi}",
    "Dockerfile"    : "# language: Dockerfile",
    "Elixir"        : "# language: Elixir",
    "Erlang"        : "% language: Erlang",
    "Excel"         : "' language: Excel",
    "F#"            : "// language: F#",
    "Fortran"       : "!language: Fortran",
    "GDScript"      : "# language: GDScript",
    "GLSL"          : "// language: GLSL",
    "Go"            : "// language: Go",
    "Groovy"        : "// language: Groovy",
    "Haskell"       : "-- language: Haskell",
    "HTML"          : "<!--language: HTML-->",
    "Isabelle"      : "(*language: Isabelle*)",
    "Java"          : "// language: Java",
    "JavaScript"    : "// language: JavaScript",
    "Julia"         : "# language: Julia",
    "Kotlin"        : "// language: Kotlin",
    "Lean"          : "-- language: Lean",
    "Lisp"          : "; language: Lisp",
    "Lua"           : "// language: Lua",
    "Markdown"      : "<!--language: Markdown-->",
    "Matlab"        : "% language: Matlab",
    "Objective-C"   : "// language: Objective-C",
    "Objective-C++" : "// language: Objective-C++",
    "Pascal"        : "// language: Pascal",
    "Perl"          : "# language: Perl",
    "PHP"           : "// language: PHP",
    "PowerShell"    : "# language: PowerShell",
    "Prolog"        : "% language: Prolog",
    "Python"        : "# language: Python",
    "R"             : "# language: R",
    "Racket"        : "; language: Racket",
    "RMarkdown"     : "# language: RMarkdown",
    "Ruby"          : "# language: Ruby",
    "Rust"          : "// language: Rust",
    "Scala"         : "// language: Scala",
    "Scheme"        : "; language: Scheme",
    "Shell"         : "# language: Shell",
    "Solidity"      : "// language: Solidity",
    "SPARQL"        : "# language: SPARQL",
    "SQL"           : "-- language: SQL",
    "Swift"         : "// language: swift",
    "TeX"           : "% language: TeX",
    "Thrift"        : "/* language: Thrift */",
    "TypeScript"    : "// language: TypeScript",
    "Vue"           : "<!--language: Vue-->",
    "Verilog"       : "// language: Verilog",
    "Visual Basic"  : "' language: Visual Basic",
}

app = FastAPI()

def device(config, model_path):
    if enable_chatglm_cpp and config.use_chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16" if config.half else "f32"
        if config.quantize in [4, 5, 8]:
            dtype = f"q{config.quantize}_0"
        model = chatglm_cpp.Pipeline(model_path, dtype=dtype)
        return model
    print("chatglm-cpp not enabled, falling back to transformers")
    if config.device != "cpu":
        if not config.half:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device))
        else:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device)).half()
        if config.quantize in [4, 8]:
            print(f"Model is quantized to INT{config.quantize} format.")
            model = model.half().quantize(config.quantize)
    else:
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    lang = json_post_list.get('lang')
    prompt = json_post_list.get('prompt')
    max_length = json_post_list.get('max_length', 128)
    top_p = json_post_list.get('top_p', 0.95)
    temperature = json_post_list.get('temperature', 0.2)
    top_k = json_post_list.get('top_k', 0)
    if lang != "None":
        prompt = LANGUAGE_TAG[lang] + "\n" + prompt
    if enable_chatglm_cpp and use_chatglm_cpp:
        response = model.generate(prompt,
                                  max_length=max_length,
                                  do_sample=temperature > 0,
                                  top_p=top_p,
                                  top_k=top_k,
                                  temperature=temperature)
    else:
        response = model.chat(tokenizer,
                              prompt,
                              max_length=max_length,
                              top_p=top_p,
                              top_k=top_k,
                              temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "lang": lang,
        "status": 200,
        "time": time,
    }
    return answer

def api_start(config):
    global use_chatglm_cpp
    use_chatglm_cpp = config.use_chatglm_cpp
    model_path = "CodeModels/CodeGeex2"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7861, workers=1)
```
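This script (and the three below) hands `api_start()` a `config` object whose construction is not shown, even though `argparse` is imported. Here is a minimal sketch of a compatible entry point; the flag names and defaults are my assumptions, chosen only to match the attributes that `device()` and `api_start()` actually read:

```python
# Hypothetical entry point: flag names and defaults are assumptions,
# chosen to match the attributes read by device() and api_start().
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CodeGeex2 API server")
    parser.add_argument("--device", type=str, default="0",
                        help='CUDA device index as a string, or "cpu"')
    parser.add_argument("--half", action="store_true",
                        help="load the model weights in fp16")
    parser.add_argument("--quantize", type=int, default=None,
                        choices=[4, 5, 8], help="quantization bits")
    parser.add_argument("--use-chatglm-cpp", dest="use_chatglm_cpp",
                        action="store_true",
                        help="use chatglm-cpp if it is installed")
    config = parser.parse_args()
    api_start(config)
```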
## ChatGLM2_6B

```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch

def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

app = FastAPI()

def device(config, model_path):
    if config.device != "cpu":
        if not config.half:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device))
        else:
            model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda(int(config.device)).half()
        if config.quantize in [4, 8]:
            print(f"Model is quantized to INT{config.quantize} format.")
            model = model.half().quantize(config.quantize)
    else:
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    history = json_post_list.get('history', [])
    max_length = json_post_list.get('max_length', 2048)
    top_p = json_post_list.get('top_p', 0.7)
    temperature = json_post_list.get('temperature', 0.95)
    top_k = json_post_list.get('top_k', 0)
    response, history = model.chat(tokenizer,
                                   prompt,
                                   history=history,
                                   max_length=max_length,
                                   top_p=top_p,
                                   temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": time,
    }
    torch_gc(model.device)
    return answer

def api_start(config):
    model_path = "LanguageModels/ChatGLM2_6B/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7862, workers=1)
```
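Because this endpoint returns the updated `history`, a client can feed it back in for multi-turn chat. A minimal sketch, assuming the server above is running locally on port 7862:

```python
# Hypothetical test client for the ChatGLM2_6B endpoint (assumes localhost:7862).
import requests

def chat(prompt, history=None):
    payload = {"prompt": prompt, "history": history or []}
    r = requests.post("http://127.0.0.1:7862", json=payload, timeout=300)
    r.raise_for_status()
    data = r.json()
    return data["response"], data["history"]

# First turn, then feed the returned history back for a follow-up turn.
response, history = chat("你好")
print(response)
response, history = chat("用一句话介绍你自己", history)
print(response)
```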
## Baichuan2_13B

```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.utils import GenerationConfig
import uvicorn, json, datetime
import torch

def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

app = FastAPI()

def device(config, model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",
                                                 torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_path)
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    messages = []
    messages.append({"role": "user", "content": prompt})
    response = model.chat(tokenizer, messages)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "status": 200,
        "time": time,
    }
    torch_gc(model.device)
    return answer

def api_start(config):
    model_path = "LanguageModels/Baichuan2_13B_Chat/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7863, workers=1)
```
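Note that this handler rebuilds `messages` from a single user prompt on every call, so the endpoint is stateless; multi-turn chat would require passing a message list back and forth, as the ChatGLM2 service does. A minimal client sketch, assuming the server runs locally on port 7863:

```python
# Hypothetical client for the Baichuan2_13B_Chat endpoint (assumes localhost:7863).
# Each request is independent: the server keeps no conversation state.
import requests

r = requests.post("http://127.0.0.1:7863",
                  json={"prompt": "你的名字是"},
                  timeout=300)
r.raise_for_status()
print(r.json()["response"])
```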
## sqlcoder

```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.utils import GenerationConfig
import uvicorn, json, datetime
import torch

def torch_gc(mydevice):
    if torch.cuda.is_available():
        with torch.cuda.device(mydevice):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

app = FastAPI()

def device(config, model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",
                                                 load_in_8bit=True, use_cache=True, trust_remote_code=True)
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    # Stop generation at the closing ``` of the SQL code block.
    eos_token_id = tokenizer.convert_tokens_to_ids(["```"])[0]
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**inputs,
                                   num_return_sequences=1,
                                   eos_token_id=eos_token_id,
                                   pad_token_id=eos_token_id,
                                   max_new_tokens=400,
                                   do_sample=False,
                                   num_beams=5)
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # Keep only the first statement inside the last ```sql fence.
    response = outputs[0].split("```sql")[-1].split("```")[0].split(";")[0].strip() + ";"
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "status": 200,
        "time": time,
    }
    torch_gc(model.device)
    return answer

def api_start(config):
    model_path = "CodeModels/sqlcoder/"
    global tokenizer
    global model
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = device(config, model_path)
    uvicorn.run(app, host="0.0.0.0", port=7864, workers=1)
```
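Because the handler slices the output around a ```sql fence, the prompt should steer the model to answer inside one. The template below is an assumption modeled on defog's published sqlcoder prompt, not something from this post; adjust it to your own schema:

````python
# Hypothetical client for the sqlcoder endpoint (assumes localhost:7864).
import requests

# Prompt template is an assumption modeled on defog's sqlcoder examples;
# the server extracts the text after the last ```sql fence.
prompt = """### Task
Generate a SQL query to answer the following question:
`How many orders were placed in 2023?`

### Database Schema
CREATE TABLE orders (id INT, created_at DATE, amount DECIMAL);

### Answer
Given the schema, here is the SQL query:
```sql
"""

r = requests.post("http://127.0.0.1:7864", json={"prompt": prompt}, timeout=600)
print(r.json()["response"])
````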
## Testing after startup

```bash
curl -X POST "http://127.0.0.1:7864" -H 'Content-Type: application/json' -d '{"prompt": "你的名字是"}'
```
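Beyond the single curl above, a quick smoke test can hit all four services in one go. Ports match the scripts above; the payloads are only illustrative:

```python
# Smoke-test all four endpoints after startup; payloads are illustrative.
import requests

tests = {
    7861: {"lang": "Python", "prompt": "# a function that reverses a string\ndef "},
    7862: {"prompt": "你好"},
    7863: {"prompt": "你好"},
    7864: {"prompt": "-- How many users signed up today?\nSELECT "},
}

for port, payload in tests.items():
    try:
        r = requests.post(f"http://127.0.0.1:{port}", json=payload, timeout=300)
        r.raise_for_status()
        print(port, "->", r.json().get("response"))
    except Exception as exc:
        print(port, "FAILED:", exc)
```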